diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 1624 |
1 files changed, 911 insertions, 713 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index beedd6ed64d3..cb6128778a83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,9 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -72,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -105,11 +100,8 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - u32 item_size; u64 num_refs; u64 extent_flags; u64 owner = 0; @@ -128,25 +120,20 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; - } - search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; - if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -159,41 +146,40 @@ search_again: } if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - owner = btrfs_get_extent_owner_root(fs_info, leaf, - path->slots[0]); - } else { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_item *ei; + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(fs_info, ret, NULL); - - goto out_free; + btrfs_abort_transaction(trans, ret); + return ret; } - BUG_ON(num_refs == 0); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, ret); + return ret; + } + extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } - if (!trans) - goto out; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -213,15 +199,13 @@ search_again: spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); -out: + WARN_ON(num_refs == 0); if (refs) *refs = num_refs; @@ -229,8 +213,7 @@ out: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -426,15 +409,15 @@ static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, btrfs_extent_data_ref_offset(leaf, ref)); } -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -448,9 +431,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; - int ret; int recow; - int err = -ENOENT; + int ret; key.objectid = bytenr; if (parent) { @@ -464,26 +446,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + if (ret < 0) + return ret; if (parent) { - if (!ret) - return 0; - goto fail; + if (ret) + return -ENOENT; + return 0; } + ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; + if (ret) { + if (ret > 0) + return -ENOENT; + return ret; + } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -504,37 +486,37 @@ again: btrfs_release_path(path); goto again; } - err = 0; + ret = 0; break; } path->slots[0]++; } fail: - return err; + return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; + key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } @@ -543,15 +525,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, goto fail; leaf = path->nodes[0]; - if (parent) { + if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { @@ -559,7 +541,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, + if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); @@ -574,18 +556,16 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); + btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -633,7 +613,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -705,20 +684,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; + key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; + key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -810,7 +789,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -961,6 +939,25 @@ again: ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -974,13 +971,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } @@ -1045,7 +1044,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1190,7 +1188,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1255,12 +1252,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1315,13 +1312,29 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, bytes_left = end - start; } - if (bytes_left) { + while (bytes_left) { + u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - bytes_left >> SECTOR_SHIFT, + bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); - if (!ret) - *discarded_bytes += bytes_left; + + if (ret) { + if (ret != -EOPNOTSUPP) + break; + continue; + } + + start += bytes_to_discard; + bytes_left -= bytes_to_discard; + *discarded_bytes += bytes_to_discard; + + if (btrfs_trim_interrupted()) { + ret = -ERESTARTSYS; + break; + } } + return ret; } @@ -1439,7 +1452,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1462,42 +1475,22 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * - * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ - * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical - * bytenr of the parent block. Since new extents are always - * created with indirect references, this will only be the case - * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must - * be 0 - * - * @root_objectid: The id of the root where this modification has originated, - * this can be either one of the well-known metadata trees or - * the subvolume id which references this extent. - * - * @owner: For data extents it is the inode number of the owning file. - * For metadata extents this parameter holds the level in the - * tree of the extent. - * - * @offset: For metadata extents the offset is ignored and is currently - * always passed as 0. For data extents it is the fileoffset - * this extent belongs to. - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u64 refs; int refs_to_add = node->ref_mod; int ret; @@ -1508,10 +1501,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, - parent, root_objectid, owner, + node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1526,22 +1519,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); + ret = insert_tree_block_ref(trans, path, node, bytenr); else - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); + ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1569,15 +1557,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, bool insert_reserved) { int ret = 0; - struct btrfs_delayed_data_ref *ref; u64 parent = 0; u64 flags = 0; - ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; + parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_key key; @@ -1588,6 +1574,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, .is_inc = true, .generation = trans->transid, }; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); if (extent_op) flags |= extent_op->flags_to_set; @@ -1596,21 +1584,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; - ret = alloc_reserved_file_extent(trans, parent, ref->root, - flags, ref->objectid, - ref->offset, &key, - node->ref_mod, href->owning_root); + ret = alloc_reserved_file_extent(trans, parent, node->ref_root, + flags, owner, offset, &key, + node->ref_mod, + href->owning_root); free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, - ref->objectid, ref->offset, - extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, - ref->root, ref->objectid, - ref->offset, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1642,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1663,7 +1647,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = extent_op->level; + key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; @@ -1673,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1690,16 +1674,16 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { ret = -EUCLEAN; btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", - head->bytenr, head->num_bytes, extent_op->level); - goto out; + head->bytenr, head->num_bytes, head->level); + return ret; } } @@ -1712,15 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(trans, leaf); -out: - btrfs_free_path(path); return ret; } @@ -1732,16 +1713,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; - ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; + ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, @@ -1759,16 +1738,13 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, .generation = trans->transid, }; - BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); if (!ret) btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1814,40 +1790,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. - */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1922,7 +1864,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1941,7 +1883,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -1964,39 +1906,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. - */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -2013,11 +1922,11 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -2040,7 +1949,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the @@ -2066,7 +1974,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -2098,13 +2006,18 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { - max_count = delayed_refs->num_heads_ready; + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); min_bytes = U64_MAX; } do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2251,7 +2164,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } @@ -2268,7 +2181,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; - int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -2278,21 +2190,21 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, + btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; @@ -2308,7 +2220,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -2346,6 +2258,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { + u64 ref_owner; + u64 ref_offset; + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -2353,15 +2268,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root, break; } - data_ref = btrfs_delayed_node_to_data_ref(ref); + ref_owner = btrfs_delayed_ref_owner(ref); + ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || - data_ref->offset != offset) { + if (ref->ref_root != btrfs_root_id(root) || + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2372,11 +2287,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2390,40 +2347,37 @@ static noinline int check_committed_ref(struct btrfs_root *root, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2431,57 +2385,69 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2492,14 +2458,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 num_bytes; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct btrfs_ref generic_ref = { 0 }; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; @@ -2526,6 +2489,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { + .action = action, + .parent = parent, + .ref_root = ref_root, + }; + if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -2535,35 +2504,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) + ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (ref.bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.owning_root = ref_root; + key.offset -= btrfs_file_extent_offset(buf, fi); - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, ref_root); - btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset, root->root_key.objectid, - for_reloc); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = fs_info->nodesize; - /* We don't know the owning_root, use 0. */ - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, 0); - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, - root->root_key.objectid, for_reloc); + /* We don't know the owning_root, leave as 0. */ + ref.bytenr = btrfs_node_blockptr(buf, i); + ref.num_bytes = fs_info->nodesize; + + btrfs_init_tree_ref(&ref, level - 1, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } @@ -2625,13 +2592,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2639,8 +2603,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2778,15 +2742,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2832,36 +2796,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - space_info->bytes_zone_unusable += len; + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } @@ -2876,34 +2823,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; - - mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); + next_state = btrfs_next_extent_state(unpin, cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); - BUG_ON(ret); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. + */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2917,16 +2893,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2938,7 +2918,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } - return 0; + return unpin_error; } /* @@ -3099,9 +3079,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3121,6 +3099,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner_objectid = btrfs_delayed_ref_owner(node); + u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); u64 delayed_ref_root = href->owning_root; @@ -3146,7 +3126,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, + node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* @@ -3173,7 +3153,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3248,7 +3228,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", - bytenr, parent, root_objectid, owner_objectid, + bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { @@ -3312,7 +3292,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3406,13 +3385,14 @@ out: static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3430,7 +3410,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); @@ -3440,7 +3420,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -3452,30 +3432,42 @@ out_delayed_unlock: return 0; } -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref) +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *bg; int ret; if (root_id != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_ref generic_ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = buf->start, + .num_bytes = buf->len, + .parent = parent, + .owning_root = btrfs_header_owner(buf), + .ref_root = root_id, + }; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, - btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); - BUG_ON(ret); /* -ENOMEM */ + if (ret < 0) + return ret; } if (!last_ref) - return; + return 0; if (btrfs_header_generation(buf) != trans->transid) goto out; @@ -3521,17 +3513,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); - btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_free_reserved_bytes(bg, buf->len, false); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: - - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + return 0; } /* Can return -ENOMEM */ @@ -3547,11 +3534,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { - btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3559,10 +3543,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ret = btrfs_add_delayed_data_ref(trans, ref, 0); } - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -4154,6 +4135,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4208,7 +4190,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ @@ -4425,11 +4407,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. */ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4451,6 +4444,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4476,7 +4470,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - trace_find_free_extent_search_loop(root, ffe_ctl); + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4528,7 +4522,7 @@ search: } have_block_group: - trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4621,7 +4615,8 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4697,7 +4692,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); @@ -4743,8 +4738,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4756,7 +4751,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); @@ -4872,7 +4867,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4891,16 +4885,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 flags = extent_op->flags_to_set; + const u64 flags = (extent_op ? extent_op->flags_to_set : 0); + /* The owner of a tree block is the level. */ + int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - ref = btrfs_delayed_node_to_tree_ref(node); - extent_key.objectid = node->bytenr; if (skinny_metadata) { - extent_key.offset = ref->level; + /* The owner of a tree block is the level. */ + extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; @@ -4933,21 +4927,20 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); + btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4958,19 +4951,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_ref generic_ref = { 0 }; - u64 root_objectid = root->root_key.objectid; - u64 owning_root = root_objectid; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins->objectid, + .num_bytes = ins->offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; - ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); + ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) - owning_root = root->relocation_src_root; + generic_ref.owning_root = root->relocation_src_root; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0, owning_root); - btrfs_init_data_ref(&generic_ref, root_objectid, owner, - offset, 0, false); + btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -5093,7 +5087,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - __btrfs_tree_lock(buf, nest); + btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); @@ -5108,24 +5102,24 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_NEW, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5148,8 +5142,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; - struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -5192,38 +5184,48 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - extent_op = btrfs_alloc_delayed_extent_op(); - if (!extent_op) { - ret = -ENOMEM; - goto out_free_buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .parent = parent, + .owning_root = owning_root, + .ref_root = root_objectid, + }; + + if (!skinny_metadata || flags != 0) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = (skinny_metadata ? false : true); + extent_op->update_flags = (flags != 0); + } else { + extent_op = NULL; } - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = skinny_metadata ? false : true; - extent_op->update_flags = true; - extent_op->level = level; - - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent, owning_root); - btrfs_init_tree_ref(&generic_ref, level, root_objectid, - root->root_key.objectid, false); + + btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); - if (ret) - goto out_free_delayed; + if (ret) { + btrfs_free_delayed_extent_op(extent_op); + goto out_free_buf; + } } return buf; -out_free_delayed: - btrfs_free_delayed_extent_op(extent_op); out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -5243,11 +5245,99 @@ struct walk_control { int reada_slot; int reada_count; int restarted; + /* Indicate that extent info needs to be looked up when walking the tree. */ + int lookup_info; }; +/* + * This is our normal stage. We are traversing blocks the current snapshot owns + * and we are dropping any of our references to any children we are able to, and + * then freeing the block once we've processed all of the children. + */ #define DROP_REFERENCE 1 + +/* + * We enter this stage when we have to walk into a child block (meaning we can't + * simply drop our reference to it from our current parent node) and there are + * more than one reference on it. If we are the owner of any of the children + * blocks from the current parent node then we have to do the FULL_BACKREF dance + * on them in order to drop our normal ref and add the shared ref. + */ #define UPDATE_BACKREF 2 +/* + * Decide if we need to walk down into this node to adjust the references. + * + * @root: the root we are currently deleting + * @wc: the walk control for this deletion + * @eb: the parent eb that we're currently visiting + * @refs: the number of refs for wc->level - 1 + * @flags: the flags for wc->level - 1 + * @slot: the slot in the eb that we're currently checking + * + * This is meant to be called when we're evaluating if a node we point to at + * wc->level should be read and walked into, or if we can simply delete our + * reference to it. We return true if we should walk into the node, false if we + * can skip it. + * + * We have assertions in here to make sure this is called correctly. We assume + * that sanity checking on the blocks read to this point has been done, so any + * corrupted file systems must have been caught before calling this function. + */ +static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, + struct extent_buffer *eb, u64 flags, int slot) +{ + struct btrfs_key key; + u64 generation; + int level = wc->level; + + ASSERT(level > 0); + ASSERT(wc->refs[level - 1] > 0); + + /* + * The update backref stage we only want to skip if we already have + * FULL_BACKREF set, otherwise we need to read. + */ + if (wc->stage == UPDATE_BACKREF) { + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + return true; + } + + /* + * We're the last ref on this block, we must walk into it and process + * any refs it's pointing at. + */ + if (wc->refs[level - 1] == 1) + return true; + + /* + * If we're already FULL_BACKREF then we know we can just drop our + * current reference. + */ + if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + return false; + + /* + * This block is older than our creation generation, we can drop our + * reference to it. + */ + generation = btrfs_node_ptr_generation(eb, slot); + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) + return false; + + /* + * This block was processed from a previous snapshot deletion run, we + * can skip it. + */ + btrfs_node_key_to_cpu(eb, &key, slot); + if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) + return false; + + /* All other cases we need to wander into the node. */ + return true; +} + static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, @@ -5259,7 +5349,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 refs; u64 flags; u32 nritems; - struct btrfs_key key; struct extent_buffer *eb; int ret; int slot; @@ -5289,7 +5378,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5299,28 +5388,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't care about errors in readahead. */ if (ret < 0) continue; - BUG_ON(refs == 0); - if (wc->stage == DROP_REFERENCE) { - if (refs == 1) - goto reada; + /* + * This could be racey, it's conceivable that we raced and end + * up with a bogus refs count, if that's the case just skip, if + * we are actually corrupt we will notice when we look up + * everything again with our locks. + */ + if (refs == 0) + continue; - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - if (!wc->update_ref || - generation <= root->root_key.offset) - continue; - btrfs_node_key_to_cpu(eb, &key, slot); - ret = btrfs_comp_cpu_keys(&key, - &wc->update_progress); - if (ret < 0) - continue; - } else { - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - } + /* If we don't need to visit this node don't reada. */ + if (!visit_node_for_delete(root, wc, eb, flags, slot)) + continue; reada: btrfs_readahead_node_child(eb, slot); nread++; @@ -5339,7 +5419,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; @@ -5347,27 +5427,29 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) + if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if (lookup_info && + if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); - BUG_ON(ret == -ENOMEM); if (ret) return ret; - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } } if (wc->stage == DROP_REFERENCE) { @@ -5383,13 +5465,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { - BUG_ON(!path->locks[level]); + ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } wc->flags[level] |= flag; } @@ -5412,23 +5503,186 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { - struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); - btrfs_free_path(path); - if (ret == -ENOENT) + btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1, else + * return the error if it's not -ENOENT; + */ + return (ret < 0 ) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, means that the delayed ref is running, get a + * reference and wait for the ref head to be complete and then + * try again. + */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return exists ? 1 : 0; +} + +/* + * We may not have an uptodate block, so if we are going to walk down into this + * block we need to drop the lock, read it off of the disk, re-lock it and + * return to continue dropping the snapshot. + */ +static int check_next_block_uptodate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, + struct extent_buffer *next) +{ + struct btrfs_tree_parent_check check = { 0 }; + u64 generation; + int level = wc->level; + int ret; + + btrfs_assert_tree_write_locked(next); + + generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); + + if (btrfs_buffer_uptodate(next, generation, 0)) return 0; - if (ret < 0) + + check.level = level - 1; + check.transid = generation; + check.owner_root = btrfs_root_id(root); + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); + + btrfs_tree_unlock(next); + if (level == 1) + reada_walk_down(trans, root, wc, path); + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); return ret; - return 1; + } + btrfs_tree_lock(next); + wc->lookup_info = 1; + return 0; +} + +/* + * If we determine that we don't have to visit wc->level - 1 then we need to + * determine if we can drop our reference. + * + * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. + * + * If we are DROP_REFERENCE this will figure out if we need to drop our current + * reference, skipping it if we dropped it from a previous incompleted drop, or + * dropping it if we still have a reference to it. + */ +static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, struct walk_control *wc, + struct extent_buffer *next, u64 owner_root) +{ + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = next->start, + .num_bytes = root->fs_info->nodesize, + .owning_root = owner_root, + .ref_root = btrfs_root_id(root), + }; + int level = wc->level; + int ret; + + /* We are UPDATE_BACKREF, we're not dropping anything. */ + if (wc->stage == UPDATE_BACKREF) + return 0; + + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + ref.parent = path->nodes[level]->start; + } else { + ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); + if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, "mismatched block owner"); + return -EIO; + } + } + + /* + * If we had a drop_progress we need to verify the refs are set as + * expected. If we find our ref then we know that from here on out + * everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, next->start, ref.parent, + level - 1); + if (ret <= 0) + return ret; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have already + * accounted them at merge time (replace_path), thus we could skip + * expensive subtree trace here. + */ + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + wc->refs[level - 1] > 1) { + u64 generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + + ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); + if (ret) { + btrfs_err_rl(root->fs_info, +"error %d accounting shared subtree, quota is out of sync, rescan required", + ret); + } + } + + /* + * We need to update the next key in our walk control so we can update + * the drop_progress key accordingly. We don't care if find_next_key + * doesn't find a key because that means we're at the end and are going + * to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + + btrfs_init_tree_ref(&ref, level - 1, 0, false); + return btrfs_free_extent(trans, &ref); } /* @@ -5447,21 +5701,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) + struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; - u64 parent; u64 owner_root = 0; - struct btrfs_tree_parent_check check = { 0 }; - struct btrfs_key key; - struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; - int reada = 0; int ret = 0; - bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -5471,28 +5719,18 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; + generation <= btrfs_root_origin_generation(root)) { + wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - check.level = level - 1; - check.transid = generation; - check.owner_root = root->root_key.objectid; - check.has_first_key = true; - btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, - path->slots[level]); + next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), + level - 1); + if (IS_ERR(next)) + return PTR_ERR(next); - next = find_extent_buffer(fs_info, bytenr); - if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr, - root->root_key.objectid, level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); - reada = 1; - } btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, @@ -5503,57 +5741,31 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { - btrfs_err(fs_info, "Missing references."); - ret = -EIO; + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + bytenr); + ret = -EUCLEAN; goto out_unlock; } - *lookup_info = 0; + wc->lookup_info = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - need_account = true; - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; + /* If we don't have to walk into this node skip it. */ + if (!visit_node_for_delete(root, wc, path->nodes[level], + wc->flags[level - 1], path->slots[level])) + goto skip; - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; - } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; + /* + * We have to walk down into this node, and if we're currently at the + * DROP_REFERNCE stage and this block is shared then we need to switch + * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. + */ + if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; } - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; - } - - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, &check); - if (IS_ERR(next)) { - return PTR_ERR(next); - } else if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - return -EIO; - } - btrfs_tree_lock(next); - } + ret = check_next_block_uptodate(trans, root, path, wc, next); + if (ret) + return ret; level--; ASSERT(level == btrfs_header_level(next)); @@ -5570,76 +5782,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->reada_slot = 0; return 0; skip: + ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); + if (ret) + goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - ASSERT(root->root_key.objectid == - btrfs_header_owner(path->nodes[level])); - if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level])) { - btrfs_err(root->fs_info, - "mismatched block owner"); - ret = -EIO; - goto out_unlock; - } - parent = 0; - } - - /* - * If we had a drop_progress we need to verify the refs are set - * as expected. If we find our ref then we know that from here - * on out everything should be correct, and we can clear the - * ->restarted flag. - */ - if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, - level - 1); - if (ret < 0) - goto out_unlock; - if (ret == 0) - goto no_delete; - ret = 0; - wc->restarted = 0; - } - - /* - * Reloc tree doesn't contribute to qgroup numbers, and we have - * already accounted them at merge time (replace_path), - * thus we could skip expensive subtree trace here. - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { - ret = btrfs_qgroup_trace_subtree(trans, next, - generation, level - 1); - if (ret) { - btrfs_err_rl(fs_info, - "Error %d accounting shared subtree. Quota is out of sync, rescan required.", - ret); - } - } - - /* - * We need to update the next key in our walk control so we can - * update the drop_progress key accordingly. We don't care if - * find_next_key doesn't find a key because that means we're at - * the end and are going to clean up now. - */ - wc->drop_level = level; - find_next_key(path, level, &wc->drop_progress); - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, owner_root); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, - 0, false); - ret = btrfs_free_extent(trans, &ref); - if (ret) - goto out_unlock; - } -no_delete: - *lookup_info = 1; + wc->lookup_info = 1; ret = 1; out_unlock: @@ -5667,13 +5815,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + int ret = 0; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); + ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; @@ -5691,7 +5839,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, * count is one. */ if (!path->locks[level]) { - BUG_ON(level == 0); + ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; @@ -5705,7 +5853,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return ret; } - BUG_ON(wc->refs[level] == 0); + if (unlikely(wc->refs[level] == 0)) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", + eb->start); + return -EUCLEAN; + } if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5715,7 +5868,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { @@ -5723,8 +5876,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -5744,40 +5900,63 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) + else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != + else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } - btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, - wc->refs[level] == 1); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: wc->refs[level] = 0; wc->flags[level] = 0; - return 0; + return ret; owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); + btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } +/* + * walk_down_tree consists of two steps. + * + * walk_down_proc(). Look up the reference count and reference of our current + * wc->level. At this point path->nodes[wc->level] should be populated and + * uptodate, and in most cases should already be locked. If we are in + * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and + * we can walk back up the tree. If we are UPDATE_BACKREF we have to set + * FULL_BACKREF on this node if it's not already set, and then do the + * FULL_BACKREF conversion dance, which is to drop the root reference and add + * the shared reference to all of this nodes children. + * + * do_walk_down(). This is where we actually start iterating on the children of + * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping + * our reference to the children that return false from visit_node_for_delete(), + * which has various conditions where we know we can just drop our reference + * without visiting the node. For UPDATE_BACKREF we will skip any children that + * visit_node_for_delete() returns false for, only walking down when necessary. + * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of + * snapshot deletion. + */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; - int lookup_info = 1; int ret = 0; + wc->lookup_info = 1; while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); + ret = walk_down_proc(trans, root, path, wc); if (ret) break; @@ -5788,7 +5967,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = do_walk_down(trans, root, path, wc, &lookup_info); + ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; @@ -5799,6 +5978,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, return (ret == 1) ? 0 : ret; } +/* + * walk_up_tree() is responsible for making sure we visit every slot on our + * current node, and if we're at the end of that node then we call + * walk_up_proc() on our current node which will do one of a few things based on + * our stage. + * + * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level + * then we need to walk back up the tree, and then going back down into the + * other slots via walk_down_tree to update any other children from our original + * wc->shared_level. Once we're at or above our wc->shared_level we can switch + * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. + * + * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. + * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents + * in our current leaf. After that we call btrfs_free_tree_block() on the + * current node and walk up to the next node to walk down the next slot. + */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -5849,8 +6045,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { - const bool is_reloc_root = (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID); + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5858,24 +6053,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) struct btrfs_root_item *root_item = &root->root_item; struct walk_control *wc; struct btrfs_key key; - int err = 0; - int ret; + const u64 rootid = btrfs_root_id(root); + int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -5888,12 +6083,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } - err = btrfs_run_delayed_items(trans); - if (err) + ret = btrfs_run_delayed_items(trans); + if (ret) goto out_end_trans; /* @@ -5924,11 +6119,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->lowest_level = 0; - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + WARN_ON(ret > 0); + ret = 0; /* * unlock our path, this is safe because only this @@ -5941,14 +6136,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; + /* + * btrfs_lookup_extent_info() returns 0 for success, + * or < 0 for error. + */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_end_trans; - } + BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) @@ -5974,19 +6172,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); + ret = 0; break; } @@ -6008,7 +6205,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) root_item); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -6019,7 +6215,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); - err = -EAGAIN; + ret = -EAGAIN; goto out_free; } @@ -6033,19 +6229,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); - if (err) + if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } @@ -6054,16 +6249,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } else if (ret > 0) { - /* if we fail to delete the orphan item this time + ret = 0; + /* + * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); + btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } @@ -6089,11 +6284,19 @@ out_free: kfree(wc); btrfs_free_path(path); out: + if (!ret && root_dropped) { + ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); + if (ret < 0) + btrfs_warn_rl(fs_info, + "failed to cleanup qgroup 0/%llu: %d", + rootid, ret); + ret = 0; + } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. */ - if (!err && unfinished_drop) + if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* @@ -6105,7 +6308,7 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - return err; + return ret; } /* @@ -6120,24 +6323,21 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; int ret = 0; - int wret; - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6161,21 +6361,19 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) break; - } - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) + ret = walk_up_tree(trans, root, path, wc, parent_level); + if (ret) { + if (ret > 0) + ret = 0; break; + } } kfree(wc); - btrfs_free_path(path); return ret; } @@ -6237,13 +6435,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, @@ -6276,8 +6474,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bit(&device->alloc_state, start, - start + bytes - 1, CHUNK_TRIMMED, NULL); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -6286,7 +6484,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -6378,13 +6576,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); |