diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
| -rw-r--r-- | fs/btrfs/extent-tree.c | 3142 |
1 files changed, 1764 insertions, 1378 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..e4cae34620d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,8 +16,9 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> -#include "misc.h" -#include "tree-log.h" +#include "ctree.h" +#include "extent-tree.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -25,32 +26,28 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" -#include "block-group.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" +#include "delayed-inode.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_ref_head *href, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -58,57 +55,33 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); + struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key); -static int block_group_bits(struct btrfs_block_group *cache, u64 bits) +static int block_group_bits(const struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); - return 0; -} - -void btrfs_free_excluded_extents(struct btrfs_block_group *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 start, end; - - start = cache->start; - end = start + cache->length - 1; - - clear_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -122,18 +95,17 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -149,25 +121,20 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; - } - search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; - if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -180,37 +147,40 @@ search_again: } if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(fs_info, ret, NULL); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_item *ei; + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); - goto out_free; + if (unlikely(item_size < sizeof(*ei))) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); + return ret; } - BUG_ON(num_refs == 0); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); + btrfs_abort_transaction(trans, ret); + return ret; + } + extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } - if (!trans) - goto out; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -230,22 +200,21 @@ search_again: spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); -out: + WARN_ON(num_refs == 0); if (refs) *refs = num_refs; if (flags) *flags = extent_flags; -out_free: - btrfs_free_path(path); + if (owning_root) + *owning_root = owner; + return ret; } @@ -357,16 +326,22 @@ out_free: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -375,26 +350,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -403,11 +377,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } - btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, + WARN_ON(1); + btrfs_print_leaf(eb); + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); - WARN_ON(1); return BTRFS_REF_TYPE_INVALID; } @@ -419,32 +393,32 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) +static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -458,9 +432,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; - int ret; int recow; - int err = -ENOENT; + int ret; key.objectid = bytenr; if (parent) { @@ -474,26 +447,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + if (ret < 0) + return ret; if (parent) { - if (!ret) - return 0; - goto fail; + if (ret) + return -ENOENT; + return 0; } + ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; + if (ret) { + if (ret > 0) + return -ENOENT; + return ret; + } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -514,37 +487,37 @@ again: btrfs_release_path(path); goto again; } - err = 0; + ret = 0; break; } path->slots[0]++; } fail: - return err; + return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) + const struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; + key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } @@ -553,15 +526,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, goto fail; leaf = path->nodes[0]; - if (parent) { + if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { @@ -569,7 +542,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, + if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); @@ -584,18 +557,16 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); + btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(leaf); ret = 0; fail: btrfs_release_path(path); @@ -625,12 +596,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; } else { - BUG(); + btrfs_err(trans->fs_info, + "unrecognized backref key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); @@ -643,25 +614,23 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(leaf); } return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +static noinline u32 extent_data_ref_count(const struct btrfs_path *path, + const struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; + const struct btrfs_extent_data_ref *ref1; + const struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -670,10 +639,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + ref2 = (const struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { @@ -716,20 +685,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) + const struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; + key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; + key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -754,7 +723,7 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key) { @@ -810,7 +779,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int type; int want; int ret; - int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; @@ -821,8 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); - path->search_for_extension = 1; - path->keep_locks = 1; + path->search_for_extension = true; } else extra_size = -1; @@ -837,10 +804,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } /* * We may be a newly converted file system which still has the old fat @@ -867,19 +832,26 @@ again: } if (ret && !insert) { - err = -ENOENT; + ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { - err = -EIO; + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); + ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); goto out; } @@ -899,22 +871,17 @@ again: else needed = BTRFS_REF_TYPE_BLOCK; - err = -ENOENT; - while (1) { - if (ptr >= end) { - if (ptr > end) { - err = -EUCLEAN; - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, -"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", - path->slots[0], root_objectid, owner, offset, parent); - } - break; - } + ret = -ENOENT; + while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); - if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { + ret = -EUCLEAN; goto out; } @@ -930,7 +897,7 @@ again: dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { - err = 0; + ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < @@ -941,14 +908,14 @@ again: ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < root_objectid) @@ -957,12 +924,41 @@ again: } ptr += btrfs_extent_inline_ref_size(type); } - if (err == -ENOENT && insert) { + + if (unlikely(ptr > end)) { + ret = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + goto out; + } + + if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = true; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -972,25 +968,27 @@ again: if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { - path->keep_locks = 0; - path->search_for_extension = 0; + if (path->keep_locks) { + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); } - return err; + if (insert) + path->search_for_extension = false; + return ret; } /* * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1013,7 +1011,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(path, size); + btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1047,7 +1045,6 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1080,13 +1077,15 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref( + struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1099,18 +1098,33 @@ void update_inline_extent_backref(struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -1120,10 +1134,43 @@ void update_inline_extent_backref(struct btrfs_path *path, refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; - BUG_ON(refs_to_mod != -1); + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref(). + */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; if (refs > 0) { @@ -1140,9 +1187,9 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(leaf); + return 0; } static noinline_for_stack @@ -1164,21 +1211,17 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * We're adding refs to a tree block we already own, this * should not happen at all. */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, -"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", - bytenr, num_bytes, root_objectid); - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - WARN_ON(1); - btrfs_crit(trans->fs_info, - "path->slots[0]=%d path->nodes[0]:", path->slots[0]); - btrfs_print_leaf(path->nodes[0]); - } +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(trans, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans->fs_info, path, iref, parent, + setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1196,7 +1239,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(trans, path, iref, + -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else @@ -1209,11 +1253,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << 9); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << 9); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1251,7 +1296,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (size) { - ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; @@ -1267,12 +1313,29 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, bytes_left = end - start; } - if (bytes_left) { - ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + while (bytes_left) { + u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); + + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); - if (!ret) - *discarded_bytes += bytes_left; + + if (ret) { + if (ret != -EOPNOTSUPP) + break; + continue; + } + + start += bytes_to_discard; + bytes_left -= bytes_to_discard; + *discarded_bytes += bytes_to_discard; + + if (btrfs_trim_interrupted()) { + ret = -ERESTARTSYS; + break; + } } + return ret; } @@ -1390,7 +1453,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1403,7 +1466,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } /* - * __btrfs_inc_extent_ref - insert backreference for a given extent + * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. @@ -1413,45 +1476,24 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * - * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ - * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical - * bytenr of the parent block. Since new extents are always - * created with indirect references, this will only be the case - * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must - * be 0 - * - * @root_objectid: The id of the root where this modification has originated, - * this can be either one of the well-known metadata trees or - * the subvolume id which references this extent. - * - * @owner: For data extents it is the inode number of the owning file. - * For metadata extents this parameter holds the level in the - * tree of the extent. - * - * @offset: For metadata extents the offset is ignored and is currently - * always passed as 0. For data extents it is the fileoffset - * this extent belongs to. - * - * @refs_to_add Number of references to add - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u64 refs; + int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); @@ -1460,10 +1502,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, - parent, root_objectid, owner, + node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1478,65 +1520,84 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); + ret = insert_tree_block_ref(trans, path, node, bytenr); + if (ret) + btrfs_abort_transaction(trans, ret); } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); + ret = insert_extent_data_ref(trans, path, node, bytenr); + if (ret) + btrfs_abort_transaction(trans, ret); } - if (ret) - btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } +static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_head *href) +{ + u64 root = href->owning_root; + + /* + * Don't check must_insert_reserved, as this is called from contexts + * where it has already been unset. + */ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || + !href->is_data || !btrfs_is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, + BTRFS_QGROUP_RSV_DATA); +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *href, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; - struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; u64 parent = 0; - u64 ref_root = 0; u64 flags = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_key key; + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = node->num_bytes, + .is_data = true, + .is_inc = true, + .generation = trans->transid, + }; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); + if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, parent, ref_root, - flags, ref->objectid, - ref->offset, &ins, - node->ref_mod); + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + ret = alloc_reserved_file_extent(trans, parent, node->ref_root, + flags, owner, offset, &key, + node->ref_mod, + href->owning_root); + free_head_ref_squota_rsv(trans->fs_info, href); + if (!ret) + ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->objectid, ref->offset, - node->ref_mod, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1562,18 +1623,17 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head, + const struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; int ret; - int err = 0; int metadata = 1; if (TRANS_ABORTED(trans)) @@ -1590,7 +1650,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = extent_op->level; + key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; @@ -1600,10 +1660,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { + return ret; + } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; @@ -1619,13 +1677,16 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { - err = -EIO; - goto out; + ret = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, head->level); + return ret; } } @@ -1633,54 +1694,60 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); - goto out; + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return err; + return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *href, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; - struct btrfs_delayed_tree_ref *ref; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 parent = 0; u64 ref_root = 0; - ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; + ref_root = node->ref_root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags); + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = fs_info->nodesize, + .is_data = false, + .is_inc = true, + .generation = trans->transid, + }; + ret = alloc_reserved_tree_block(trans, node, extent_op); + if (!ret) + btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1689,30 +1756,35 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *href, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; if (TRANS_ABORTED(trans)) { - if (insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + if (insert_reserved) { + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); + free_head_ref_squota_rsv(trans->fs_info, href); + } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, node, extent_op, + ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, node, extent_op, + ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", @@ -1721,40 +1793,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1787,28 +1825,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - int nr_items = 1; /* Dropping this ref head update. */ + u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { + int nr_csums; + spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + + btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); + + ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } + /* must_insert_reserved can be set only if we didn't run the head ref. */ + if (head->must_insert_reserved) + free_head_ref_squota_rsv(fs_info, head); - btrfs_delayed_refs_rsv_release(fs_info, nr_items); + return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) + struct btrfs_delayed_ref_head *head, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1819,7 +1867,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1838,12 +1886,12 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes); if (head->is_data) { struct btrfs_root *csum_root; @@ -1853,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -1861,48 +1909,15 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref, - unsigned long *run_refs) + struct btrfs_delayed_ref_head *locked_ref, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; - int must_insert_reserved = 0; + bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; @@ -1910,16 +1925,14 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } - (*run_refs)++; - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -1939,25 +1952,32 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; + /* + * Unsetting this on the head ref relinquishes ownership of + * the rsv_bytes, so it is critical that every possible code + * path from here forward frees all reserves including qgroup + * reserve. + */ + locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, ref, extent_op, + ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -1966,7 +1986,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -1977,20 +1997,30 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) + u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; - ktime_t start = ktime_get(); int ret; unsigned long count = 0; - unsigned long actual_count = 0; + unsigned long max_count = 0; + u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; + if (min_bytes == 0) { + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); + min_bytes = U64_MAX; + } + do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2013,10 +2043,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2028,7 +2057,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * Success, perform the usual cleanup of a processed * head */ - ret = cleanup_ref_head(trans, locked_ref); + ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; @@ -2045,26 +2074,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref = NULL; cond_resched(); - } while ((nr != -1 && count < nr) || locked_ref); + } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || + (max_count > 0 && count < max_count) || + locked_ref); - /* - * We don't want to include ref heads since we can have empty ref heads - * and those will drastically skew our runtime down since we just do - * accounting, no actual extent tree updates. - */ - if (actual_count > 0) { - u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); - u64 avg; - - /* - * We weigh the current average higher than our current runtime - * to avoid large swings in the average. - */ - spin_lock(&delayed_refs->lock); - avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ - spin_unlock(&delayed_refs->lock); - } return 0; } @@ -2112,24 +2125,25 @@ static u64 find_middle(struct rb_root *root) #endif /* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. + * Start processing the delayed reference count updates and extent insertions + * we have queued up so far. + * + * @trans: Transaction handle. + * @min_bytes: How many bytes of delayed references to process. After this + * many bytes we stop processing delayed references if there are + * any more. If 0 it means to run all existing delayed references, + * but not new ones added after running all existing ones. + * Use (u64)-1 (U64_MAX) to run all existing delayed references + * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *head; int ret; - int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) @@ -2139,48 +2153,35 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) - count = delayed_refs->num_heads_ready; - again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - ret = __btrfs_run_delayed_refs(trans, count); - if (ret < 0) { + ret = __btrfs_run_delayed_refs(trans, min_bytes); + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } - if (run_all) { + if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first_cached(&delayed_refs->href_root); - if (!node) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); - goto out; + return 0; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); - /* Mutex was contended, block until it's released and retry. */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } -out: + return 0; } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level) + struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2192,21 +2193,21 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, + btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; @@ -2222,7 +2223,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -2260,6 +2261,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { + u64 ref_owner; + u64 ref_offset; + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -2267,15 +2271,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root, break; } - data_ref = btrfs_delayed_node_to_data_ref(ref); + ref_owner = btrfs_delayed_ref_owner(ref); + ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || - data_ref->offset != offset) { + if (ref->ref_root != btrfs_root_id(root) || + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2286,11 +2290,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2299,85 +2345,112 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; - BUG_ON(ret == 0); /* Corruption */ + return ret; + if (unlikely(ret == 0)) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + return -EUCLEAN; + } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); - /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + return 1; + /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + } + + /* If extent item has more than 1 inline ref then it's shared */ + if (item_size != expected_size) + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2385,17 +2458,14 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + bool full_backref, bool inc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 num_bytes; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct btrfs_ref generic_ref = { 0 }; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; @@ -2422,6 +2492,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { + .action = action, + .parent = parent, + .ref_root = ref_root, + }; + if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -2431,34 +2507,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) + ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (ref.bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.owning_root = ref_root; + key.offset -= btrfs_file_extent_offset(buf, fi); - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); - btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset, root->root_key.objectid, - for_reloc); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = fs_info->nodesize; - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, - root->root_key.objectid, for_reloc); + /* We don't know the owning_root, leave as 0. */ + ref.bytenr = btrfs_node_blockptr(buf, i); + ref.num_bytes = fs_info->nodesize; + + btrfs_init_tree_ref(&ref, level - 1, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } @@ -2469,15 +2544,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, true); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, false); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) @@ -2517,52 +2592,46 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info) } static int pin_down_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group *cache, - u64 bytenr, u64 num_bytes, int reserved) + struct btrfs_block_group *bg, + u64 bytenr, u64 num_bytes, bool reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + struct btrfs_space_info *space_info = bg->space_info; + const u64 reserved_bytes = (reserved ? num_bytes : 0); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + bg->pinned += num_bytes; + bg->reserved -= reserved_bytes; + spin_unlock(&bg->lock); + space_info->bytes_reserved -= reserved_bytes; + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); + spin_unlock(&space_info->lock); + + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } -int btrfs_pin_extent(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int reserved) +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(trans, cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, true); btrfs_put_block_group(cache); return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes) + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; @@ -2574,10 +2643,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, eb->start, eb->len, false); /* remove us from the free space cache (if we're there at all) */ - ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; @@ -2676,22 +2745,24 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; - bool readonly; while (start <= end) { - readonly = false; + u64 len; + bool readonly; + if (!cache || start >= cache->start + cache->length) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (unlikely(cache == NULL)) { + /* Logic error, something removed the block group. */ + return -EUCLEAN; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2724,42 +2795,27 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); + readonly = cache->ro; cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + spin_unlock(&cache->lock); + + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; - if (cache->ro) { + + if (readonly) { space_info->bytes_readonly += len; - readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - space_info->bytes_zone_unusable += len; - readonly = true; + btrfs_space_info_update_bytes_zone_unusable(space_info, len); + } else if (return_free_space) { + btrfs_return_free_space(space_info, len); } - spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); + return 0; } @@ -2768,34 +2824,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; - - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + next_state = btrfs_next_extent_state(unpin, cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + ret = unpin_extent_range(fs_info, start, end, true); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. + */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2809,16 +2894,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2830,27 +2919,88 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } + return unpin_error; +} + +/* + * Parse an extent item's inline extents looking for a simple quotas owner ref. + * + * @fs_info: the btrfs_fs_info for this mount + * @leaf: a leaf in the extent tree containing the extent item + * @slot: the slot in the leaf where the extent item is found + * + * Returns the objectid of the root that originally allocated the extent item + * if the inline owner ref is expected and present, otherwise 0. + * + * If an extent item has an owner ref item, it will be the first inline ref + * item. Therefore the logic is to check whether there are any inline ref + * items, then check the type of the first one. + */ +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_owner_ref *oref; + unsigned long ptr; + unsigned long end; + int type; + + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) + return 0; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + btrfs_item_size(leaf, slot); + + /* No inline ref items of any kind, can't check type. */ + if (ptr == end) + return 0; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + + /* We found an owner ref, get the root out of it. */ + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + return btrfs_extent_owner_ref_root_id(leaf, oref); + } + + /* We have inline refs, but not an owner ref. */ return 0; } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool is_data) + u64 bytenr, struct btrfs_squota_delta *delta) { int ret; + u64 num_bytes = delta->num_bytes; - if (is_data) { + if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } } - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { + ret = btrfs_record_squota_delta(trans->fs_info, delta); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2862,6 +3012,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + /* * Drop one or more refs of @node. * @@ -2922,14 +3079,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_ref_head *href, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -2939,11 +3095,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner_objectid = btrfs_delayed_ref_owner(node); + u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); @@ -2954,20 +3114,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - if (!is_data && refs_to_drop != 1) { + if (unlikely(!is_data && refs_to_drop != 1)) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (is_data) skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, + node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* @@ -2994,25 +3154,25 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - if (iref) { - btrfs_crit(info, -"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(iref)) { + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + return -EUCLEAN; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3053,64 +3213,63 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); if (ret > 0) btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); - goto out; + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, node->parent, node->ref_root, owner_objectid, + owner_offset, path->slots[0]); + return ret; } else { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - if (item_size < sizeof(*ei) + sizeof(*bi)) { - btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", - key.objectid, key.type, key.offset, - owner_objectid, item_size, - sizeof(*ei) + sizeof(*bi)); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + + if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) { + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + return -EUCLEAN; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { - btrfs_crit(info, - "trying to drop %d refs but we only have %llu for bytenr %llu", - refs_to_drop, refs, bytenr); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(refs < refs_to_drop)) { + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + return -EUCLEAN; } refs -= refs_to_drop; @@ -3122,44 +3281,49 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - if (!found_extent) { - btrfs_crit(info, -"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(!found_extent)) { + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + return -EUCLEAN; } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } } else { + struct btrfs_squota_delta delta = { + .root = delayed_ref_root, + .num_bytes = num_bytes, + .is_data = is_data, + .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), + }; + /* In this branch refs == 1 */ if (found_extent) { - if (is_data && refs_to_drop != - extent_data_ref_count(path, iref)) { - btrfs_crit(info, - "invalid refs_to_drop, current refs %u refs_to_drop %u", - extent_data_ref_count(path, iref), - refs_to_drop); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(is_data && refs_to_drop != + extent_data_ref_count(path, iref))) { + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + return -EUCLEAN; } if (iref) { - if (path->slots[0] != extent_slot) { - btrfs_crit(info, -"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", - key.objectid, key.type, - key.offset); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(path->slots[0] != extent_slot)) { + abort_and_dump(trans, path, +"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref", + BTRFS_KEY_FMT_VALUE(&key), + path->slots[0]); + return -EUCLEAN; } } else { /* @@ -3168,45 +3332,40 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ - if (path->slots[0] != extent_slot + 1) { - btrfs_crit(info, - "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + if (unlikely(path->slots[0] != extent_slot + 1)) { + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + return -EUCLEAN; } path->slots[0] = extent_slot; num_to_del = 2; } } + /* + * We can't infer the data owner from the delayed ref, so we need + * to try to get it from the owning ref item. + * + * If it is not present, then that extent was not written under + * simple quotas mode, so we don't need to account for its deletion. + */ + if (is_data) + delta.root = btrfs_get_extent_owner_root(trans->fs_info, + leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); -out: - btrfs_free_path(path); return ret; -err_dump: - /* - * Leaf dump can take up a lot of log buffer, so we only do full leaf - * dump for debug build. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", - path->slots[0], extent_slot); - btrfs_print_leaf(path->nodes[0]); - } - - btrfs_free_path(path); - return -EUCLEAN; } /* @@ -3218,13 +3377,14 @@ err_dump: static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3242,8 +3402,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); - head->processing = 0; + btrfs_delete_ref_head(fs_info, delayed_refs, head); + head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -3252,7 +3412,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -3264,87 +3424,93 @@ out_delayed_unlock: return 0; } -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref) +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); - if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = buf->start, + .num_bytes = buf->len, + .parent = parent, + .owning_root = btrfs_header_owner(buf), + .ref_root = root_id, + }; + + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); - BUG_ON(ret); /* -ENOMEM */ + if (ret < 0) + return ret; } - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group *cache; - bool must_pin = false; - - if (root_id != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) { - btrfs_redirty_list_add(trans->transaction, buf); - goto out; - } - } + if (!last_ref) + return 0; - cache = btrfs_lookup_block_group(fs_info, buf->start); + if (btrfs_header_generation(buf) != trans->transid) + goto out; - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) goto out; - } + } - /* - * If there are tree mod log users we may have recorded mod log - * operations for this node. If we re-allocate this node we - * could replay operations on this node that happened when it - * existed in a completely different root. For example if it - * was part of root A, then was reallocated to root B, and we - * are doing a btrfs_old_search_slot(root b), we could replay - * operations that happened when the block was part of root A, - * giving us an inconsistent view of the btree. - * - * We are safe from races here because at this point no other - * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins we will not have an - * existing log of operations on this node that we have to - * contend with. - */ - if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) - must_pin = true; + bg = btrfs_lookup_block_group(fs_info, buf->start); - if (must_pin || btrfs_is_zoned(fs_info)) { - btrfs_redirty_list_add(trans->transaction, buf); - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, bg, buf->start, buf->len, true); + btrfs_put_block_group(bg); + goto out; + } - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) + || btrfs_is_zoned(fs_info)) { + pin_down_extent(trans, bg, buf->start, buf->len, true); + btrfs_put_block_group(bg); + goto out; } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(bg, buf->start, buf->len); + btrfs_free_reserved_bytes(bg, buf->len, false); + btrfs_put_block_group(bg); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + out: - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - } + return 0; } /* Can return -ENOMEM */ @@ -3360,12 +3526,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ - btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3373,32 +3535,57 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ret = btrfs_add_delayed_data_ref(trans, ref, 0); } - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; } enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ + LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ + LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. + */ LOOP_NO_EMPTY_SIZE, }; static inline void -btrfs_lock_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, - int delalloc) + bool delalloc) { btrfs_get_block_group(cache); if (delalloc) @@ -3408,7 +3595,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, - int delalloc) + bool delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3445,94 +3632,32 @@ static struct btrfs_block_group *btrfs_lock_cluster( } static inline void -btrfs_release_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) up_read(&cache->data_rwsem); btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. - */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - +static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} /* * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ @@ -3550,7 +3675,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) + !block_group_bits(cluster_bg, ffe_ctl->flags) || + !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, @@ -3559,8 +3685,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,20 +3735,10 @@ refill_cluster: if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); - - ffe_ctl->retry_clustered = true; - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; } /* * At this point we either didn't find a cluster or we weren't able to @@ -3638,7 +3753,6 @@ refill_cluster: /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) @@ -3676,25 +3790,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); - - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && - ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); - ffe_ctl->retry_unclustered = true; - return -EAGAIN; - } else if (!offset) { + if (!offset) return 1; - } ffe_ctl->found_offset = offset; return 0; } @@ -3708,7 +3805,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); - if (ret >= 0 || ret == -EAGAIN) + if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } @@ -3793,7 +3890,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, } spin_unlock(&block_group->lock); - if (!ret && !btrfs_zone_activate(block_group)) { + /* Metadata block group is activated at write time. */ + if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3817,7 +3916,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, fs_info->data_reloc_bg == 0); if (block_group->ro || - test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + (!ffe_ctl->for_data_reloc && + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } @@ -3860,8 +3960,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; - if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) - fs_info->data_reloc_bg = block_group->start; + if (ffe_ctl->for_data_reloc) { + if (!fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + /* + * Do not allow allocations from this block group, unless it is + * for data relocation. Compared to increasing the ->ro, setting + * the ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + * + * Also, this flag avoids this block group to be zone finished. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; @@ -3879,24 +3997,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc && - fs_info->data_reloc_bg == block_group->start) { - /* - * Do not allow further allocations from this block group. - * Compared to increasing the ->ro, setting the - * ->zoned_data_reloc_ongoing flag still allows nocow - * writers to come in. See btrfs_inc_nocow_writers(). - * - * We need to disable an allocation to avoid an allocation of - * regular (non-relocation data) extent. With mix of relocation - * extents and regular extents, we can dispatch WRITE commands - * (for relocation extents) and ZONE APPEND commands (for - * regular extents) at the same time to the same zone, which - * easily break the write pointer. - */ - set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; - } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3920,12 +4022,11 @@ static int do_allocation(struct btrfs_block_group *block_group, static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, - int delalloc) + bool delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ @@ -3969,6 +4070,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; @@ -4028,24 +4133,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -4054,6 +4141,7 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4075,35 +4163,24 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ + /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4119,12 +4196,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4205,6 +4284,44 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4215,19 +4332,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); default: BUG(); } @@ -4280,14 +4385,13 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; - /* For clustered allocation */ - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,12 +4400,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. */ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4323,6 +4437,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4340,6 +4455,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4463,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4473,7 @@ search: &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4515,7 @@ search: } have_block_group: + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4418,21 +4537,23 @@ have_block_group: ret = 0; } - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { + if (!cache_block_group_error) + cache_block_group_error = -EIO; + goto loop; + } + + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) goto loop; bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { - if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, - ffe_ctl->delalloc); - block_group = bg_ret; - } - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { + if (ret > 0) goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; } /* Checks */ @@ -4455,7 +4576,8 @@ have_block_group: ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,17 +4590,26 @@ have_block_group: ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4500,8 +4631,8 @@ loop: } /* - * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a - * hole that is at least as big as @num_bytes. + * Entry point to the extent allocator. Tries to find a hole that is at least + * as big as @num_bytes. * * @root - The root that will contain this extent * @@ -4547,14 +4678,14 @@ loop: int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc) + struct btrfs_key *ins, bool is_data, bool delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); @@ -4592,16 +4723,15 @@ again: "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) - btrfs_dump_space_info(fs_info, sinfo, - num_bytes, 1); + btrfs_dump_space_info(sinfo, num_bytes, 1); } } return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4613,27 +4743,27 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); return 0; } -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(trans->fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", - start); + eb->start); return -ENOSPC; } - ret = pin_down_extent(trans, cache, start, len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, true); btrfs_put_block_group(cache); return ret; } @@ -4644,7 +4774,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; @@ -4663,24 +4793,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) + struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4702,7 +4837,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); @@ -4717,14 +4859,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -4736,16 +4877,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 flags = extent_op->flags_to_set; + const u64 flags = (extent_op ? extent_op->flags_to_set : 0); + /* The owner of a tree block is the level. */ + int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - ref = btrfs_delayed_node_to_tree_ref(node); - extent_key.objectid = node->bytenr; if (skinny_metadata) { - extent_key.offset = ref->level; + /* The owner of a tree block is the level. */ + extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; @@ -4778,21 +4919,20 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); + btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4803,14 +4943,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins->objectid, + .num_bytes = ins->offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root)) + generic_ref.owning_root = root->relocation_src_root; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, - offset, 0, false); + btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -4829,6 +4975,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; + const struct btrfs_squota_delta delta = { + .root = root_objectid, + .num_bytes = ins->offset, + .generation = trans->transid, + .is_data = true, + .is_inc = true, + }; /* * Mixed block groups will exclude before processing the log so we only @@ -4854,13 +5007,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); + offset, ins, 1, root_objectid); if (ret) - btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset); + ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extra safety check in case the extent tree is corrupted and extent allocator + * chooses to use a tree block which is already used and locked. + */ +static bool check_eb_lock_owner(const struct extent_buffer *eb) +{ + if (eb->lock_owner == current->pid) { + btrfs_err_rl(eb->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + eb->start, btrfs_header_owner(eb), current->pid); + return true; + } + return false; +} +#else +static bool check_eb_lock_owner(struct extent_buffer *eb) +{ + return false; +} +#endif + static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, @@ -4874,15 +5050,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. - */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); + if (unlikely(check_eb_lock_owner(buf))) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -4901,7 +5069,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; - /* btrfs_clean_tree_block() accesses generation field. */ + /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* @@ -4911,10 +5079,10 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_tree_lock_nested(buf, nest); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); @@ -4926,22 +5094,24 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY_LOG1, NULL); else - set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY_LOG2, NULL); } else { buf->log_index = -1; - set_extent_dirty(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -4957,18 +5127,18 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; - struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { @@ -4985,7 +5155,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, - empty_size, hint, &ins, 0, 0); + empty_size, hint, &ins, false, false); if (ret) goto out_unuse; @@ -4995,47 +5165,59 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = PTR_ERR(buf); goto out_free_reserved; } + owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + owning_root = reloc_src_root; } else BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - extent_op = btrfs_alloc_delayed_extent_op(); - if (!extent_op) { - ret = -ENOMEM; - goto out_free_buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .parent = parent, + .owning_root = owning_root, + .ref_root = root_objectid, + }; + + if (!skinny_metadata || flags != 0) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = (skinny_metadata ? false : true); + extent_op->update_flags = (flags != 0); + } else { + extent_op = NULL; } - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = skinny_metadata ? false : true; - extent_op->update_flags = true; - extent_op->level = level; - - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); - btrfs_init_tree_ref(&generic_ref, level, root_objectid, - root->root_key.objectid, false); + + btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); - if (ret) - goto out_free_delayed; + if (ret) { + btrfs_free_delayed_extent_op(extent_op); + goto out_free_buf; + } } return buf; -out_free_delayed: - btrfs_free_delayed_extent_op(extent_op); out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -5055,11 +5237,99 @@ struct walk_control { int reada_slot; int reada_count; int restarted; + /* Indicate that extent info needs to be looked up when walking the tree. */ + int lookup_info; }; +/* + * This is our normal stage. We are traversing blocks the current snapshot owns + * and we are dropping any of our references to any children we are able to, and + * then freeing the block once we've processed all of the children. + */ #define DROP_REFERENCE 1 + +/* + * We enter this stage when we have to walk into a child block (meaning we can't + * simply drop our reference to it from our current parent node) and there are + * more than one reference on it. If we are the owner of any of the children + * blocks from the current parent node then we have to do the FULL_BACKREF dance + * on them in order to drop our normal ref and add the shared ref. + */ #define UPDATE_BACKREF 2 +/* + * Decide if we need to walk down into this node to adjust the references. + * + * @root: the root we are currently deleting + * @wc: the walk control for this deletion + * @eb: the parent eb that we're currently visiting + * @refs: the number of refs for wc->level - 1 + * @flags: the flags for wc->level - 1 + * @slot: the slot in the eb that we're currently checking + * + * This is meant to be called when we're evaluating if a node we point to at + * wc->level should be read and walked into, or if we can simply delete our + * reference to it. We return true if we should walk into the node, false if we + * can skip it. + * + * We have assertions in here to make sure this is called correctly. We assume + * that sanity checking on the blocks read to this point has been done, so any + * corrupted file systems must have been caught before calling this function. + */ +static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, + struct extent_buffer *eb, u64 flags, int slot) +{ + struct btrfs_key key; + u64 generation; + int level = wc->level; + + ASSERT(level > 0); + ASSERT(wc->refs[level - 1] > 0); + + /* + * The update backref stage we only want to skip if we already have + * FULL_BACKREF set, otherwise we need to read. + */ + if (wc->stage == UPDATE_BACKREF) { + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + return true; + } + + /* + * We're the last ref on this block, we must walk into it and process + * any refs it's pointing at. + */ + if (wc->refs[level - 1] == 1) + return true; + + /* + * If we're already FULL_BACKREF then we know we can just drop our + * current reference. + */ + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + + /* + * This block is older than our creation generation, we can drop our + * reference to it. + */ + generation = btrfs_node_ptr_generation(eb, slot); + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) + return false; + + /* + * This block was processed from a previous snapshot deletion run, we + * can skip it. + */ + btrfs_node_key_to_cpu(eb, &key, slot); + if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) + return false; + + /* All other cases we need to wander into the node. */ + return true; +} + static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, @@ -5071,7 +5341,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 refs; u64 flags; u32 nritems; - struct btrfs_key key; struct extent_buffer *eb; int ret; int slot; @@ -5101,38 +5370,29 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; - BUG_ON(refs == 0); - if (wc->stage == DROP_REFERENCE) { - if (refs == 1) - goto reada; + /* + * This could be racey, it's conceivable that we raced and end + * up with a bogus refs count, if that's the case just skip, if + * we are actually corrupt we will notice when we look up + * everything again with our locks. + */ + if (refs == 0) + continue; - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - if (!wc->update_ref || - generation <= root->root_key.offset) - continue; - btrfs_node_key_to_cpu(eb, &key, slot); - ret = btrfs_comp_cpu_keys(&key, - &wc->update_progress); - if (ret < 0) - continue; - } else { - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - } + /* If we don't need to visit this node don't reada. */ + if (!visit_node_for_delete(root, wc, eb, flags, slot)) + continue; reada: btrfs_readahead_node_child(eb, slot); nread++; @@ -5151,7 +5411,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; @@ -5159,26 +5419,29 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) + if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if (lookup_info && + if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); - BUG_ON(ret == -ENOMEM); + &wc->flags[level], + NULL); if (ret) return ret; - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } } if (wc->stage == DROP_REFERENCE) { @@ -5194,14 +5457,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); /* -ENOMEM */ + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb)); - BUG_ON(ret); /* -ENOMEM */ + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + ret = btrfs_set_disk_extent_flags(trans, eb, flag); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } wc->flags[level] |= flag; } @@ -5224,23 +5495,186 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { - struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); - btrfs_free_path(path); - if (ret == -ENOENT) + btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1, else + * return the error if it's not -ENOENT; + */ + return (ret < 0 ) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, means that the delayed ref is running, get a + * reference and wait for the ref head to be complete and then + * try again. + */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return exists ? 1 : 0; +} + +/* + * We may not have an uptodate block, so if we are going to walk down into this + * block we need to drop the lock, read it off of the disk, re-lock it and + * return to continue dropping the snapshot. + */ +static int check_next_block_uptodate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, + struct extent_buffer *next) +{ + struct btrfs_tree_parent_check check = { 0 }; + u64 generation; + int level = wc->level; + int ret; + + btrfs_assert_tree_write_locked(next); + + generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); + + if (btrfs_buffer_uptodate(next, generation, false)) return 0; - if (ret < 0) + + check.level = level - 1; + check.transid = generation; + check.owner_root = btrfs_root_id(root); + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); + + btrfs_tree_unlock(next); + if (level == 1) + reada_walk_down(trans, root, wc, path); + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); return ret; - return 1; + } + btrfs_tree_lock(next); + wc->lookup_info = 1; + return 0; +} + +/* + * If we determine that we don't have to visit wc->level - 1 then we need to + * determine if we can drop our reference. + * + * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. + * + * If we are DROP_REFERENCE this will figure out if we need to drop our current + * reference, skipping it if we dropped it from a previous uncompleted drop, or + * dropping it if we still have a reference to it. + */ +static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, struct walk_control *wc, + struct extent_buffer *next, u64 owner_root) +{ + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = next->start, + .num_bytes = root->fs_info->nodesize, + .owning_root = owner_root, + .ref_root = btrfs_root_id(root), + }; + int level = wc->level; + int ret; + + /* We are UPDATE_BACKREF, we're not dropping anything. */ + if (wc->stage == UPDATE_BACKREF) + return 0; + + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + ref.parent = path->nodes[level]->start; + } else { + ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); + if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) { + btrfs_err(root->fs_info, "mismatched block owner"); + return -EIO; + } + } + + /* + * If we had a drop_progress we need to verify the refs are set as + * expected. If we find our ref then we know that from here on out + * everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, next->start, ref.parent, + level - 1); + if (ret <= 0) + return ret; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have already + * accounted them at merge time (replace_path), thus we could skip + * expensive subtree trace here. + */ + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + wc->refs[level - 1] > 1) { + u64 generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + + ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); + if (ret) { + btrfs_err_rl(root->fs_info, +"error %d accounting shared subtree, quota is out of sync, rescan required", + ret); + } + } + + /* + * We need to update the next key in our walk control so we can update + * the drop_progress key accordingly. We don't care if find_next_key + * doesn't find a key because that means we're at the end and are going + * to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + + btrfs_init_tree_ref(&ref, level - 1, 0, false); + return btrfs_free_extent(trans, &ref); } /* @@ -5259,20 +5693,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; - u64 parent; - struct btrfs_tree_parent_check check = { 0 }; - struct btrfs_key key; - struct btrfs_ref ref = { 0 }; + u64 owner_root = 0; struct extent_buffer *next; int level = wc->level; - int reada = 0; int ret = 0; - bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -5282,92 +5711,57 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; + generation <= btrfs_root_origin_generation(root)) { + wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - check.level = level - 1; - check.transid = generation; - check.owner_root = root->root_key.objectid; - check.has_first_key = true; - btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, - path->slots[level]); + next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), + level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); - next = find_extent_buffer(fs_info, bytenr); - if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr, - root->root_key.objectid, level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); - reada = 1; - } btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { - btrfs_err(fs_info, "Missing references."); - ret = -EIO; + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + bytenr); + ret = -EUCLEAN; goto out_unlock; } - *lookup_info = 0; + wc->lookup_info = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - need_account = true; - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; + /* If we don't have to walk into this node skip it. */ + if (!visit_node_for_delete(root, wc, path->nodes[level], + wc->flags[level - 1], path->slots[level])) + goto skip; - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; - } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; + /* + * We have to walk down into this node, and if we're currently at the + * DROP_REFERENCE stage and this block is shared then we need to switch + * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. + */ + if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; } - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; - } - - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, &check); - if (IS_ERR(next)) { - return PTR_ERR(next); - } else if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - return -EIO; - } - btrfs_tree_lock(next); - } + ret = check_next_block_uptodate(trans, root, path, wc, next); + if (ret) + return ret; level--; ASSERT(level == btrfs_header_level(next)); - if (level != btrfs_header_level(next)) { + if (unlikely(level != btrfs_header_level(next))) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; @@ -5380,76 +5774,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->reada_slot = 0; return 0; skip: + ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); + if (ret) + goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - ASSERT(root->root_key.objectid == - btrfs_header_owner(path->nodes[level])); - if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level])) { - btrfs_err(root->fs_info, - "mismatched block owner"); - ret = -EIO; - goto out_unlock; - } - parent = 0; - } - - /* - * If we had a drop_progress we need to verify the refs are set - * as expected. If we find our ref then we know that from here - * on out everything should be correct, and we can clear the - * ->restarted flag. - */ - if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, - level - 1); - if (ret < 0) - goto out_unlock; - if (ret == 0) - goto no_delete; - ret = 0; - wc->restarted = 0; - } - - /* - * Reloc tree doesn't contribute to qgroup numbers, and we have - * already accounted them at merge time (replace_path), - * thus we could skip expensive subtree trace here. - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { - ret = btrfs_qgroup_trace_subtree(trans, next, - generation, level - 1); - if (ret) { - btrfs_err_rl(fs_info, - "Error %d accounting shared subtree. Quota is out of sync, rescan required.", - ret); - } - } - - /* - * We need to update the next key in our walk control so we can - * update the drop_progress key accordingly. We don't care if - * find_next_key doesn't find a key because that means we're at - * the end and are going to clean up now. - */ - wc->drop_level = level; - find_next_key(path, level, &wc->drop_progress); - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, - 0, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) - goto out_unlock; - } -no_delete: - *lookup_info = 1; + wc->lookup_info = 1; ret = 1; out_unlock: @@ -5477,13 +5807,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + int ret = 0; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); + ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; @@ -5501,20 +5831,26 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, * count is one. */ if (!path->locks[level]) { - BUG_ON(level == 0); + ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return ret; } - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5524,16 +5860,24 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ret = btrfs_dec_ref(trans, root, eb, 1); - else + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } else { ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -5542,53 +5886,75 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) + else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb))) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])) + else if (unlikely(btrfs_root_id(root) != + btrfs_header_owner(path->nodes[level + 1]))) goto owner_mismatch; } - btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, - wc->refs[level] == 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: wc->refs[level] = 0; wc->flags[level] = 0; - return 0; + return ret; owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); + btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } +/* + * walk_down_tree consists of two steps. + * + * walk_down_proc(). Look up the reference count and reference of our current + * wc->level. At this point path->nodes[wc->level] should be populated and + * uptodate, and in most cases should already be locked. If we are in + * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and + * we can walk back up the tree. If we are UPDATE_BACKREF we have to set + * FULL_BACKREF on this node if it's not already set, and then do the + * FULL_BACKREF conversion dance, which is to drop the root reference and add + * the shared reference to all of this nodes children. + * + * do_walk_down(). This is where we actually start iterating on the children of + * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping + * our reference to the children that return false from visit_node_for_delete(), + * which has various conditions where we know we can just drop our reference + * without visiting the node. For UPDATE_BACKREF we will skip any children that + * visit_node_for_delete() returns false for, only walking down when necessary. + * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of + * snapshot deletion. + */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; - int lookup_info = 1; - int ret; + int ret = 0; + wc->lookup_info = 1; while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); - if (ret > 0) + ret = walk_down_proc(trans, root, path, wc); + if (ret) break; if (level == 0) @@ -5598,17 +5964,34 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = do_walk_down(trans, root, path, wc, &lookup_info); + ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; } else if (ret < 0) - return ret; + break; level = wc->level; } - return 0; + return (ret == 1) ? 0 : ret; } +/* + * walk_up_tree() is responsible for making sure we visit every slot on our + * current node, and if we're at the end of that node then we call + * walk_up_proc() on our current node which will do one of a few things based on + * our stage. + * + * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level + * then we need to walk back up the tree, and then going back down into the + * other slots via walk_down_tree to update any other children from our original + * wc->shared_level. Once we're at or above our wc->shared_level we can switch + * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. + * + * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. + * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents + * in our current leaf. After that we call btrfs_free_tree_block() on the + * current node and walk up to the next node to walk down the next slot. + */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -5655,38 +6038,36 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * - * If called with for_reloc == 0, may exit early with -EAGAIN + * If called with for_reloc set, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc) { - const bool is_reloc_root = (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID); + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); struct btrfs_key key; - int err = 0; - int ret; + const u64 rootid = btrfs_root_id(root); + int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { - btrfs_free_path(path); - err = -ENOMEM; - goto out; + ret = -ENOMEM; + goto out_free; } /* @@ -5698,12 +6079,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } - err = btrfs_run_delayed_items(trans); - if (err) + ret = btrfs_run_delayed_items(trans); + if (ret) goto out_end_trans; /* @@ -5734,11 +6115,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->lowest_level = 0; - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + WARN_ON(ret > 0); + ret = 0; /* * unlock our path, this is safe because only this @@ -5751,14 +6132,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; + /* + * btrfs_lookup_extent_info() returns 0 for success, + * or < 0 for error. + */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - err = ret; + &wc->flags[level], NULL); + if (ret < 0) goto out_end_trans; - } + BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) @@ -5782,19 +6166,20 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) while (1) { ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { - err = ret; + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { - err = ret; + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); + ret = 0; break; } @@ -5814,9 +6199,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -5827,7 +6211,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); - err = -EAGAIN; + ret = -EAGAIN; goto out_free; } @@ -5841,37 +6225,36 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); - if (err) + if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } else if (ret > 0) { - /* if we fail to delete the orphan item this time + ret = 0; + /* + * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); + btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } @@ -5894,14 +6277,21 @@ out_end_trans: btrfs_end_transaction_throttle(trans); out_free: - kfree(wc); btrfs_free_path(path); out: + if (!ret && root_dropped) { + ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); + if (ret < 0) + btrfs_warn_rl(fs_info, + "failed to cleanup qgroup 0/%llu: %d", + rootid, ret); + ret = 0; + } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. */ - if (!err && unfinished_drop) + if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* @@ -5913,7 +6303,7 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - return err; + return ret; } /* @@ -5928,28 +6318,25 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct walk_control *wc; + BTRFS_PATH_AUTO_FREE(path); + struct walk_control AUTO_KFREE(wc); int level; int parent_level; int ret = 0; - int wret; - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); - atomic_inc(&parent->refs); + refcount_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -5969,28 +6356,28 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; - break; - } + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) + return ret; - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) + ret = walk_up_tree(trans, root, path, wc, parent_level); + if (ret) { + if (ret < 0) + return ret; break; + } } - kfree(wc); - btrfs_free_path(path); - return ret; + return 0; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. + */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* @@ -6042,14 +6429,14 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - btrfs_warn_in_rcu(fs_info, + DEBUG_WARN(); + btrfs_warn(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), @@ -6081,9 +6468,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bits(&device->alloc_state, start, - start + bytes - 1, - CHUNK_TRIMMED); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -6092,7 +6478,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -6184,13 +6570,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); |
