Diffstat (limited to 'fs/btrfs')
77 files changed, 8075 insertions, 6201 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3dcf9bcc2326..4188ba3fd8c3 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 309516e6a968..43c89952b7d2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT; + * paired with the smp_mb__before_atomic in btrfs_work_helper, + * this guarantees that the ordered function will see all + * updates from the ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT, ensuring that the thread + * which is going to execute the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f735b8798ba1..c9ee579bc5a6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); @@ -1049,12 +1049,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_fs_info *fs_info, +static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_path *path, u64 bytenr, int info_level, struct preftrees *preftrees, struct share_check *sc) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret; int slot; struct extent_buffer *leaf; @@ -1170,6 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct ulist *roots, const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1203,28 +1204,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; - /* - * grab both a lock on the path and a lock on the delayed ref head.
- * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ again: head = NULL; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && time_seq != BTRFS_SEQ_LAST) { -#else - if (trans && time_seq != BTRFS_SEQ_LAST) { -#endif /* - * look if there are updates for this ref queued and lock the - * head + * We have a specific time_seq we care about and trans which + * means we have the path lock, we need to grab the ref head and + * lock it so we have a consistent view of the refs at the given + * time. */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -1271,7 +1270,7 @@ again: &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(fs_info, path, bytenr, info_level, + ret = add_keyed_refs(root, path, bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1360,10 +1359,18 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. */ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; @@ -1740,6 +1747,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags_ret) { + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); int ret; u64 flags; u64 size = 0; @@ -1755,11 +1763,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, key.objectid = logical; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1779,7 +1787,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); @@ -1962,7 +1970,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->extent_root); + trans = btrfs_attach_transaction(fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) @@ -2058,7 +2066,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u64 parent = 0; int found = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2084,10 +2091,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - for (cur = 0; cur < btrfs_item_size(eb, item); cur += 
len) { + for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ btrfs_debug(fs_root->fs_info, @@ -2143,7 +2149,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; @@ -2330,6 +2336,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc( int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct btrfs_path *path = iter->path; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -2340,7 +2347,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) key.offset = (u64)-1; iter->bytenr = bytenr; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { @@ -2364,7 +2371,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->end_ptr = (u32)(iter->item_ptr + - btrfs_item_size_nr(path->nodes[0], path->slots[0])); + btrfs_item_size(path->nodes[0], path->slots[0])); ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); @@ -2383,7 +2390,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) /* If there is no inline backref, go search for keyed backref */ if (iter->cur_ptr >= iter->end_ptr) { - ret = btrfs_next_item(fs_info->extent_root, path); + ret = btrfs_next_item(extent_root, path); /* No inline nor keyed ref */ if (ret > 0) { @@ -2404,7 +2411,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->item_ptr = iter->cur_ptr; - iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( path->nodes[0], path->slots[0])); } @@ -2427,6 +2434,7 @@ release: int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; int ret; @@ -2457,7 +2465,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } /* We're at keyed items, there is no inline item, go to the next one */ - ret = btrfs_next_item(iter->fs_info->extent_root, iter->path); + extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); + ret = btrfs_next_item(extent_root, iter->path); if (ret) return ret; @@ -2469,7 +2478,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); iter->cur_ptr = iter->item_ptr; - iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], path->slots[0]); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b830b8410a..1db24e6d6d90 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/list_sort.h> #include "misc.h" #include "ctree.h" #include "block-group.h" @@ -144,6 +145,7 @@ 
void btrfs_put_block_group(struct btrfs_block_group *cache) */ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); + kfree(cache->physical_map); kfree(cache); } } @@ -512,7 +514,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; @@ -527,6 +529,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) return -ENOMEM; last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); + extent_root = btrfs_extent_root(fs_info, last); #ifdef CONFIG_BTRFS_DEBUG /* @@ -839,7 +842,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - root = fs_info->extent_root; + root = btrfs_block_group_root(fs_info); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; @@ -902,6 +905,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); if (!path) { @@ -1103,6 +1107,7 @@ out: struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -1136,8 +1141,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( num_items = 3 + map->num_stripes; free_extent_map(em); - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items); + return btrfs_start_transaction_fallback_global_rsv(root, num_items); } /* @@ -1484,13 +1488,27 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +/* + * We want block groups with a low number of used bytes to be in the beginning + * of the list, so they will get reclaimed first. + */ +static int reclaim_bgs_cmp(void *unused, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_block_group *bg1, *bg2; + + bg1 = list_entry(a, struct btrfs_block_group, bg_list); + bg2 = list_entry(b, struct btrfs_block_group, bg_list); + + return bg1->used > bg2->used; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&fs_info->unused_bgs_lock); + /* + * Sort happens under lock because we can't simply splice it and sort. + * The block groups might still be in use and reachable via bg_list, + * and their presence in the reclaim_bgs list must be preserved. 
+ */ + list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; int ret = 0; @@ -1561,18 +1585,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret && ret != -EAGAIN) + if (ret) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); next: + btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); - if (ret == -EAGAIN && list_empty(&bg->bg_list)) - list_add_tail(&bg->bg_list, &again_list); - else - btrfs_put_block_group(bg); } - list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -1654,7 +1674,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -1895,6 +1915,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); + INIT_LIST_HEAD(&cache->active_bg_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); @@ -2035,6 +2056,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, */ if (btrfs_is_zoned(info)) { btrfs_calc_zone_unusable(cache); + /* Should not have any excluded extents. Just in case, though. */ + btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; @@ -2062,15 +2085,18 @@ static int read_one_block_group(struct btrfs_fs_info *info, link_block_group(cache); set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->start)) { + if (btrfs_chunk_writeable(info, cache->start)) { + if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); + } + } else { inc_block_group_ro(cache, 1); - } else if (cache->used == 0) { - ASSERT(list_empty(&cache->bg_list)); - if (btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_discard_queue_work(&info->discard_ctl, cache); - else - btrfs_mark_bg_unused(cache); } + return 0; error: btrfs_put_block_group(cache); @@ -2135,6 +2161,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) int btrfs_read_block_groups(struct btrfs_fs_info *info) { + struct btrfs_root *root = btrfs_block_group_root(info); struct btrfs_path *path; int ret; struct btrfs_block_group *cache; @@ -2143,7 +2170,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!info->extent_root) + if (!root) return fill_dummy_bgs(info); key.objectid = 0; @@ -2246,7 +2273,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_item bgi; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; spin_lock(&block_group->lock); @@ -2259,7 +2286,6 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, key.offset = block_group->length; 
spin_unlock(&block_group->lock); - root = fs_info->extent_root; return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } @@ -2438,6 +2464,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } + /* + * New block group is likely to be used soon. Try to activate it now. + * Failure is OK for now. + */ + btrfs_zone_activate(cache); + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2479,7 +2511,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, 0, &cache->space_info); + cache->bytes_super, cache->zone_unusable, + &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2506,12 +2539,13 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; int ret; bool dirty_bg_running; do { - trans = btrfs_join_transaction(fs_info->extent_root); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -2594,7 +2628,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (!--cache->ro) { if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ - cache->zone_unusable = cache->alloc_offset - cache->used; + cache->zone_unusable = + (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } @@ -2614,7 +2650,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; @@ -3143,7 +3179,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) } int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc) + u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_block_group *cache = NULL; @@ -3380,36 +3416,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) */ check_system_chunk(trans, flags); - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; } - /* - * If this is a system chunk allocation then stop right here and do not - * add the chunk item to the chunk btree. This is to prevent a deadlock - * because this system chunk allocation can be triggered while COWing - * some extent buffer of the chunk btree and while holding a lock on a - * parent extent buffer, in which case attempting to insert the chunk - * item (or update the device item) would result in a deadlock on that - * parent extent buffer. In this case defer the chunk btree updates to - * the second phase of chunk allocation and keep our reservation until - * the second phase completes. - * - * This is a rare case and can only be triggered by the very few cases - * we have where we need to touch the chunk btree outside chunk allocation - * and chunk removal. 
These cases are basically adding a device, removing - * a device or resizing a device. - */ - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - return 0; - ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); /* * Normally we are not expected to fail with -ENOSPC here, since we have * previously reserved space in the system space_info and allocated one - * new system chunk if necessary. However there are two exceptions: + * new system chunk if necessary. However there are three exceptions: * * 1) We may have enough free space in the system space_info but all the * existing system block groups have a profile which can not be used @@ -3435,13 +3452,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) * with enough free space got turned into RO mode by a running scrub, * and in this case we have to allocate a new one and retry. We only * need do this allocate and retry once, since we have a transaction - * handle and scrub uses the commit root to search for block groups. + * handle and scrub uses the commit root to search for block groups; + * + * 3) We had one system block group with enough free space when we called + * check_system_chunk(), but after that, right before we tried to + * allocate the last extent buffer we needed, a discard operation came + * in and it temporarily removed the last free space entry from the + * block group (discard removes a free space entry, discards it, and + * then adds back the entry to the block group cache). */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3519,7 +3543,15 @@ out: * properly, either intentionally or as a bug. One example where this is * done intentionally is fsync, as it does not reserve any transaction units * and ends up allocating a variable number of metadata extents for log - * tree extent buffers. + * tree extent buffers; + * + * 4) The task has reserved enough transaction units / metadata space, but right + * before it tries to allocate the last extent buffer it needs, a discard + * operation comes in and, temporarily, removes the last free space entry from + * the only metadata block group that had free space (discard starts by + * removing a free space entry from a block group, then does the discard + * operation and, once it's done, it adds back the free space entry to the + * block group). * * We also need this 2 phases setup when adding a device to a filesystem with * a seed device - we must create new metadata and system chunks without adding @@ -3537,14 +3569,14 @@ out: * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of * the system chunk array due to concurrent allocations") provides more details. * - * For allocation of system chunks, we defer the updates and insertions into the - * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because - * if the chunk allocation is triggered while COWing an extent buffer of the - * chunk btree, we are holding a lock on the parent of that extent buffer and - * doing the chunk btree updates and insertions can require locking that parent. - * This is for the very few and rare cases where we update the chunk btree that - * are not chunk allocation or chunk removal: adding a device, removing a device - * or resizing a device. 
+ * Allocation of system chunks does not happen through this function. A task that + * needs to update the chunk btree (the only btree that uses system chunks) must + * preallocate chunk space by calling either check_system_chunk() or + * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or + * metadata chunk or when removing a chunk, while the latter is used before doing + * a modification to the chunk btree - use cases for the latter are adding, + * removing and resizing a device as well as relocation of a system chunk. + * See the comment below for more details. * * The reservation of system space, done through check_system_chunk(), as well * as all the updates and insertions into the chunk btree must be done while @@ -3581,11 +3613,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; /* - * If we are removing a chunk, don't re-enter or we would deadlock. - * System space reservation and system chunk allocation is done by the - * chunk remove operation (btrfs_remove_chunk()). + * Allocation of system chunks cannot happen through this path, as we + * could end up in a deadlock if we are allocating a data or metadata + * chunk and there is another task modifying the chunk btree. + * + * This is because while we are holding the chunk mutex, we will attempt + * to add the new chunk item to the chunk btree or update an existing + * device item in the chunk btree, while the other task that is modifying + * the chunk btree is attempting to COW an extent buffer while holding a + * lock on it and on its parent - if the COW operation triggers a system + * chunk allocation, then we can deadlock because we are holding the + * chunk mutex and we may need to access that extent buffer or its parent + * in order to add the chunk item or update a device item. + * + * Tasks that want to modify the chunk tree should reserve system space + * before updating the chunk btree, by calling either + * btrfs_reserve_chunk_metadata() or check_system_chunk(). + * It's possible that after a task reserves the space, it still ends up + * here - this happens in the cases described above at do_chunk_alloc(). + * The task will have to either retry or fail.
*/ - if (trans->removing_chunk) + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; space_info = btrfs_find_space_info(fs_info, flags); @@ -3684,17 +3732,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) return num_dev; } -/* - * Reserve space in the system space for allocating or removing a chunk - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +static void reserve_chunk_space(struct btrfs_trans_handle *trans, + u64 bytes, + u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; - u64 thresh; int ret = 0; - u64 num_devs; /* * Needed because we can end up allocating a system chunk and for an @@ -3707,19 +3752,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_metadata_size(fs_info, num_devs) + - btrfs_calc_insert_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); + left, bytes, type); btrfs_dump_space_info(fs_info, info, 0, 0); } - if (left < thresh) { + if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; @@ -3728,35 +3767,83 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * needing it, as we might not need to COW all nodes/leafs from * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). - * - * Also, if our caller is allocating a system chunk, do not - * attempt to insert the chunk item in the chunk btree, as we - * could deadlock on an extent buffer since our caller may be - * COWing an extent buffer from the chunk btree. */ - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); - } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + } else { /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore - * any error here. + * any error here. An ENOSPC here could happen, due to + * the cases described at do_chunk_alloc() - the system + * block group we just created was just turned into RO + * mode by a scrub for example, or a running discard + * temporarily removed its free space entries, etc. */ btrfs_chunk_alloc_add_chunk_item(trans, bg); } } if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, + ret = btrfs_block_rsv_add(fs_info, &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); + bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) - trans->chunk_bytes_reserved += thresh; + trans->chunk_bytes_reserved += bytes; } } +/* + * Reserve space in the system space for allocating or removing a chunk. + * The caller must be holding fs_info->chunk_mutex. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 num_devs = get_profile_num_devs(fs_info, type); + u64 bytes; + + /* num_devs device items to update and 1 chunk item to add or remove. 
*/ + bytes = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + reserve_chunk_space(trans, bytes, type); +} + +/* + * Reserve space in the system space, if needed, for doing a modification to the + * chunk btree. + * + * @trans: A transaction handle. + * @is_item_insertion: Indicate if the modification is for inserting a new item + * in the chunk btree or if it's for the deletion or update + * of an existing item. + * + * This is used in a context where we need to update the chunk btree outside + * block group allocation and removal, to avoid a deadlock with a concurrent + * task that is allocating a metadata or data block group and therefore needs to + * update the chunk btree while holding the chunk mutex. After the update to the + * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. + * + */ +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 bytes; + + if (is_item_insertion) + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + else + bytes = btrfs_calc_metadata_size(fs_info, 1); + + mutex_lock(&fs_info->chunk_mutex); + reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); + mutex_unlock(&fs_info->chunk_mutex); +} + void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; @@ -3821,9 +3908,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } - spin_unlock(&info->unused_bgs_lock); - spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->reclaim_bgs)) { block_group = list_first_entry(&info->reclaim_bgs, struct btrfs_block_group, @@ -3833,6 +3918,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->zone_active_bgs_lock); + while (!list_empty(&info->zone_active_bgs)) { + block_group = list_first_entry(&info->zone_active_bgs, + struct btrfs_block_group, + active_bg_list); + list_del_init(&block_group->active_bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->zone_active_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c72a71efcb18..5878b7ce3b78 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -98,6 +98,7 @@ struct btrfs_block_group { unsigned int to_copy:1; unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; + unsigned int zone_is_active:1; int disk_cache_state; @@ -202,7 +203,10 @@ struct btrfs_block_group { */ u64 alloc_offset; u64 zone_unusable; + u64 zone_capacity; u64 meta_write_pointer; + struct map_lookup *physical_map; + struct list_head active_bg_list; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc); + u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 04a6226e0388..b3ee49b0b1e8 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,7 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" +#include "disk-io.h" /* * HOW DO BLOCK RESERVES WORK @@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) { @@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush) { @@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - unsigned min_items; + struct btrfs_root *root, *tmp; + u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item); + unsigned int min_items = 1; /* * The global block rsv is based on the size of the extent tree, the * checksum tree and the root tree. If the fs is empty we want to set * it to a minimal amount for safety. + * + * We also are going to need to modify the minimum of the tree root and + * any global roots we could touch. */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - - /* - * We at a minimum are going to modify the csum root, the tree root, and - * the extent root. 
- */ - min_items = 3; + read_lock(&fs_info->global_root_lock); + rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, + rb_node) { + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || + root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + num_bytes += btrfs_root_used(&root->root_item); + min_items++; + } + } + read_unlock(&fs_info->global_root_lock); /* * But we also want to reserve enough space so we can do the fallback @@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_unlock(&sinfo->lock); } +void btrfs_init_root_block_rsv(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + switch (root->root_key.objectid) { + case BTRFS_CSUM_TREE_OBJECTID: + case BTRFS_EXTENT_TREE_OBJECTID: + case BTRFS_FREE_SPACE_TREE_OBJECTID: + root->block_rsv = &fs_info->delayed_refs_rsv; + break; + case BTRFS_ROOT_TREE_OBJECTID: + case BTRFS_DEV_TREE_OBJECTID: + case BTRFS_QUOTA_TREE_OBJECTID: + root->block_rsv = &fs_info->global_block_rsv; + break; + case BTRFS_CHUNK_TREE_OBJECTID: + root->block_rsv = &fs_info->chunk_block_rsv; + break; + default: + root->block_rsv = NULL; + break; + } +} + void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; - /* - * Our various recovery options can leave us with NULL roots, so check - * here and just bail before we go dereferencing NULLs everywhere. - */ - if (!fs_info->extent_root || !fs_info->csum_root || - !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root) - return; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_update_global_block_rsv(fs_info); } @@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv( struct btrfs_block_rsv *block_rsv = NULL; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) + (root == fs_info->uuid_root) || + (trans->adding_csums && + root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -523,7 +539,7 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0b6ae5302837..3b67ff08d434 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -50,6 +50,7 @@ struct btrfs_block_rsv { }; void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, @@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void 
btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, +int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 76ee1452c57b..b3e46aabc3d8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,17 +138,26 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file + /* + * Total number of bytes pending delalloc, used by stat to calculate the + * real block usage of the file. This is used only for files. */ u64 delalloc_bytes; - /* - * Total number of bytes pending delalloc that fall within a file - * range that is either a hole or beyond EOF (and no prealloc extent - * exists in the range). This is always <= delalloc_bytes. - */ - u64 new_delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. + */ + u64 last_dir_index_offset; + }; /* * total number of bytes pending defrag, used by stat to check whether @@ -339,7 +348,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) struct btrfs_dio_private { struct inode *inode; - u64 logical_offset; + + /* + * Since DIO can use anonymous pages, we cannot use page_offset() to + * grab the file offset, thus we need a dedicated member for the file offset.
+ */ + u64 file_offset; u64 disk_bytenr; /* Used for bio::bi_size */ u32 bytes; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 86816088927f..7e9f90fa0388 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -186,7 +186,6 @@ struct btrfsic_dev_state { struct list_head collision_resolving_node; /* list node */ struct btrfsic_block dummy_block_for_bio_bh_flush; u64 last_flush_gen; - char name[BDEVNAME_SIZE]; }; struct btrfsic_block_hashtable { @@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; ds->bdev = NULL; ds->state = NULL; - ds->name[0] = '\0'; INIT_LIST_HEAD(&ds->collision_resolving_node); ds->last_flush_gen = 0; btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); @@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", + "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, - dev_state->name, dev_bytenr, + dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); @@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame: if (disk_item_offset + sizeof(struct btrfs_item) > sf->block_ctx->len) { leaf_item_out_of_bounce_error: - pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data(sf->block_ctx, @@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame: (uintptr_t)nodehdr; if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > sf->block_ctx->len) { - pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: node item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data( @@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block( if (next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block), next_block->logical_bytenr); else - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( + "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block)); @@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, 
dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } @@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } btrfsic_read_from_block_data(block_ctx, &file_extent_item, @@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data( next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n", + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", next_bytenr, - next_block_ctx.dev->name, + next_block_ctx.dev->bdev, next_block_ctx.dev_bytenr, mirror_num, next_block->logical_bytenr); @@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_bio *multi = NULL; + struct btrfs_io_context *multi = NULL; struct btrfs_device *device; length = len; @@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(num_pages - i); + bio = btrfs_bio_alloc(num_pages - i); bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio->bi_opf = REQ_OP_READ; @@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %s!\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: read error at logical %llu dev %pg!\n", + block_ctx->start, block_ctx->dev->bdev); bio_put(bio); return -1; } @@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state) list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { const struct btrfsic_block_link *l; - pr_info("%c-block @%llu (%s/%llu/%d)\n", + pr_info("%c-block @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num); list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), 
l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); } @@ -1743,16 +1748,18 @@ again: if (block->logical_bytenr != bytenr && !(!block->is_metadata && block->logical_bytenr == 0)) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - bytenr, dev_state->name, + pr_info( +"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block), block->logical_bytenr); else - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); @@ -1767,8 +1774,9 @@ again: processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } @@ -1778,9 +1786,10 @@ again: list_empty(&block->ref_to_list) ? ' ' : '!', list_empty(&block->ref_from_list) ? ' ' : '!'); if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_disk_key_objectid(&block->disk_key), block->disk_key.type, @@ -1792,9 +1801,10 @@ again: } if (!block->is_iodone && !block->never_written) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_stack_header_generation( (struct btrfs_header *) @@ -1921,8 +1931,9 @@ again: if (!is_metadata) { processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block (%s/%llu/?) !found in hash table, D.\n", - dev_state->name, dev_bytenr); + pr_info( + "written block (%pg/%llu/?) !found in hash table, D\n", + dev_state->bdev, dev_bytenr); if (!state->include_extent_data) { /* ignore that written D block */ goto continue_loop; @@ -1939,8 +1950,9 @@ again: btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( + "written block @%llu (%pg/%llu/?) 
!found in hash table, M\n", + bytenr, dev_state->bdev, dev_bytenr); } block_ctx.dev = dev_state; @@ -1995,9 +2007,9 @@ again: block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New written %c-block @%llu (%s/%llu/%d)\n", + pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", is_metadata ? 'M' : 'D', - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", bp->bi_status, btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; @@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp) dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, + pr_info("bio_end_io() new %pg flush_gen=%llu\n", + dev_state->bdev, dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) @@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock( if (!(superblock->generation > state->max_superblock_generation || 0 == state->max_superblock_generation)) { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n", + pr_info( + "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); } else { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n", + pr_info( + "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); @@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, */ list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); if (l->block_ref_to->never_written) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n", + pr_info( +"btrfs: attempt to 
write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (!l->block_ref_to->is_iodone) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->block_ref_to->iodone_w_error) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; @@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, l->parent_generation && BTRFSIC_GENERATION_UNKNOWN != l->block_ref_to->generation) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, l->block_ref_to->generation, @@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, ret = -1; } else if (l->block_ref_to->flush_gen > l->block_ref_to->dev_state->last_flush_gen) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, block->flush_gen, l->block_ref_to->dev_state->last_flush_gen); @@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock( */ list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - 
l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); if (l->block_ref_from->is_superblock && @@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock( static void btrfsic_print_add_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { - pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } static void btrfsic_print_rem_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { - pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. 
*/ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", + indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { printk("[...]\n"); @@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( block->never_written = never_written; block->mirror_num = mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%s/%llu/%d)\n", + pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", additional_string, btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } if (WARN_ON(!match)) { - pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( +"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", + bytenr, dev_state->bdev, dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, if (ret) continue; - pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n", - bytenr, block_ctx.dev->name, + pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", + bytenr, block_ctx.dev->bdev, block_ctx.dev_bytenr, mirror_num); } } @@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio) if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); } else { struct btrfsic_block *const block = &dev_state->dummy_block_for_bio_bh_flush; @@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, list_for_each_entry(device, dev_head, dev_list) { struct btrfsic_dev_state *ds; - const char *p; if (!device->bdev || !device->name) continue; @@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, } ds->bdev = device->bdev; ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - p = kbasename(ds->name); - strlcpy(ds->name, p, sizeof(ds->name)); btrfsic_dev_state_hashtable_add(ds, &btrfsic_dev_state_hashtable); } @@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else - pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n", + pr_info( +"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, 
b_all->dev_bytenr, b_all->mirror_num); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7869ad12bc6e..71e5b2e9a1ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> @@ -28,6 +29,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "subpage.h" #include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -94,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws, } } -static int compression_decompress_bio(int type, struct list_head *ws, - struct compressed_bio *cb) +static int compression_decompress_bio(struct list_head *ws, + struct compressed_bio *cb) { - switch (type) { + switch (cb->compress_type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); @@ -155,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return 0; shash->tfm = fs_info->csum_shash; @@ -172,16 +175,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = page_address(page); + kaddr = kmap_atomic(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); + kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); - if (btrfs_io_bio(bio)->device) + if (btrfs_bio(bio)->device) btrfs_dev_stat_inc_and_print( - btrfs_io_bio(bio)->device, + btrfs_bio(bio)->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); return -EIO; } @@ -192,6 +196,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, return 0; } +/* + * Reduce bio and io accounting for a compressed_bio with its corresponding bio. + * + * Return true if there is no pending bio nor io. + * Return false otherwise. + */ +static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + unsigned int bi_size = 0; + bool last_io = false; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* + * At endio time, bi_iter.bi_size doesn't represent the real bio size. + * Thus here we have to iterate through all segments to grab correct + * bio size. + */ + bio_for_each_segment_all(bvec, bio, iter_all) + bi_size += bvec->bv_len; + + if (bio->bi_status) + cb->errors = 1; + + ASSERT(bi_size && bi_size <= cb->compressed_len); + last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, + &cb->pending_sectors); + /* + * Here we must wake up the possible error handler after all other + * operations on @cb finished, or we can race with + * finish_compressed_bio_*() which may free @cb. 
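The pending_sectors accounting introduced above is the core of this rework: the submitter seeds the refcount with the total sector count up front, every completed bio subtracts the sectors it carried, and the wake_up_var()/wait_var_event() pair lets an error path sleep until only never-submitted sectors remain. A minimal sketch of that kernel pattern, with hypothetical demo_* names standing in for the btrfs specifics:

#include <linux/refcount.h>
#include <linux/wait_bit.h>

struct demo_ctx {
        refcount_t pending_sectors;     /* seeded with the total sector count */
};

/* Completion side: subtract what this bio finished, wake any waiter. */
static bool demo_complete_sectors(struct demo_ctx *ctx, unsigned int nr_sectors)
{
        bool last = refcount_sub_and_test(nr_sectors, &ctx->pending_sectors);

        /* Wake only after all other accesses to ctx, mirroring the comment above. */
        wake_up_var(ctx);
        return last;
}

/* Error side: wait until only the sectors we never submitted remain. */
static void demo_wait_for_inflight(struct demo_ctx *ctx, unsigned int unsubmitted)
{
        wait_var_event(ctx, refcount_read(&ctx->pending_sectors) == unsubmitted);
}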
+ */ + wake_up_var(cb); + + return last_io; +} + +static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +{ + unsigned int index; + struct page *page; + + /* Release the compressed pages */ + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + put_page(page); + } + + /* Do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(bio); + ASSERT(!bio->bi_status); + /* + * We have verified the checksum already, set page checked so + * the end_io handlers know about it + */ + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { + u64 bvec_start = page_offset(bvec->bv_page) + + bvec->bv_offset; + + btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), + bvec->bv_page, bvec_start, + bvec->bv_len); + } + + bio_endio(cb->orig_bio); + } + + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +} + /* when we finish reading compressed pages from the disk, we * decompress them and then run the bio end_io routines on the * decompressed pages (in the inode address space). @@ -206,25 +291,17 @@ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; - struct page *page; - unsigned int index; - unsigned int mirror = btrfs_io_bio(bio)->mirror_num; + unsigned int mirror = btrfs_bio(bio)->mirror_num; int ret = 0; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) + if (!dec_and_test_compressed_bio(cb, bio)) goto out; /* * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly. 
*/ - btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + btrfs_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; /* @@ -248,36 +325,7 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) cb->errors = 1; - - /* release the compressed pages */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * we have verified the checksum already, set page - * checked so the end_io handlers know about it - */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) - SetPageChecked(bvec->bv_page); - - bio_endio(cb->orig_bio); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + finish_compressed_bio_read(cb, bio); out: bio_put(bio); } @@ -289,6 +337,7 @@ out: static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; @@ -311,7 +360,8 @@ static noinline void end_compressed_writeback(struct inode *inode, for (i = 0; i < ret; i++) { if (cb->errors) SetPageError(pages[i]); - end_page_writeback(pages[i]); + btrfs_page_clamp_clear_writeback(fs_info, pages[i], + cb->start, cb->len); put_page(pages[i]); } nr_pages -= ret; @@ -320,60 +370,127 @@ static noinline void end_compressed_writeback(struct inode *inode, /* the inode may be gone now */ } -/* - * do the cleanup once all the compressed pages hit the disk. - * This will clear writeback on the file pages and free the compressed - * pages. - * - * This also calls the writeback end hooks for the file pages so that - * metadata and checksums can be updated in the file. - */ -static void end_compressed_bio_write(struct bio *bio) +static void finish_compressed_bio_write(struct compressed_bio *cb) { - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; + struct inode *inode = cb->inode; unsigned int index; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) - goto out; - - /* ok, we're the last bio for this extent, step one is to - * call back into the FS and do all the end_io operations + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. 
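Splitting finish_compressed_bio_read()/finish_compressed_bio_write() out of the endio callbacks is what allows the submission error paths later in this patch to finalize a half-submitted compressed_bio by hand. The resulting endio shape, condensed for illustration (not the verbatim btrfs code):

static void demo_end_compressed_bio(struct bio *bio)
{
        struct compressed_bio *cb = bio->bi_private;

        /* Per-sector accounting; only the very last completion continues. */
        if (!dec_and_test_compressed_bio(cb, bio))
                goto out;

        /* Same helper the submitter's error path can call directly. */
        finish_compressed_bio_write(cb);
out:
        bio_put(bio);
}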
*/ - inode = cb->inode; - btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, !cb->errors); end_compressed_writeback(inode, cb); - /* note, our inode could be gone now */ + /* Note, our inode could be gone now */ /* - * release the compressed pages, these came from alloc_page and + * Release the compressed pages, these came from alloc_page and * are not attached to the inode at all */ - index = 0; for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; + struct page *page = cb->compressed_pages[index]; + page->mapping = NULL; put_page(page); } - /* finally free the cb struct */ + /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); +} + +/* + * Do the cleanup once all the compressed pages hit the disk. This will clear + * writeback on the file pages and free the compressed pages. + * + * This also calls the writeback end hooks for the file pages so that metadata + * and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio) +{ + struct compressed_bio *cb = bio->bi_private; + + if (!dec_and_test_compressed_bio(cb, bio)) + goto out; + + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + + finish_compressed_bio_write(cb); out: bio_put(bio); } +static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, + struct compressed_bio *cb, + struct bio *bio, int mirror_num) +{ + blk_status_t ret; + + ASSERT(bio->bi_iter.bi_size); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) + return ret; + ret = btrfs_map_bio(fs_info, bio, mirror_num); + return ret; +} + +/* + * Allocate a compressed_bio, which will be used to read/write on-disk + * (aka, compressed) data. + * + * @cb: The compressed_bio structure, which records all the needed + * information to bind the compressed data to the uncompressed + * page cache. + * @disk_bytenr: The logical bytenr where the compressed data will be read + * from or written to. + * @endio_func: The endio function to call after the IO for compressed data + * is finished. + * @next_stripe_start: Return value of the logical bytenr where the next stripe starts. + * Let the caller know to only fill the bio up to the stripe + * boundary. + */ + + +static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, + unsigned int opf, bio_end_io_t endio_func, + u64 *next_stripe_start) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct btrfs_io_geometry geom; + struct extent_map *em; + struct bio *bio; + int ret; + + bio = btrfs_bio_alloc(BIO_MAX_VECS); + + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bio->bi_opf = opf; + bio->bi_private = cb; + bio->bi_end_io = endio_func; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); + if (IS_ERR(em)) { + bio_put(bio); + return ERR_CAST(em); + } + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); + + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); + free_extent_map(em); + if (ret < 0) { + bio_put(bio); + return ERR_PTR(ret); + } + *next_stripe_start = disk_bytenr + geom.len; + + return bio; +} + /* * worker function to build and submit bios for previously compressed pages. 
* The corresponding pages in the inode should be marked for writeback @@ -394,20 +511,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; - unsigned long bytes_left; - int pg_index = 0; - struct page *page; - u64 first_byte = disk_start; + u64 cur_disk_bytenr = disk_start; + u64 next_stripe_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - WARN_ON(!PAGE_ALIGNED(start)); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = &inode->vfs_inode; cb->start = start; @@ -418,118 +534,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - - if (use_append) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(device)) { - kfree(cb); - bio_put(bio); - return BLK_STS_NOTSUPP; + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!bio) { + bio = alloc_compressed_bio(cb, cur_disk_bytenr, + bio_op | write_flags, end_compressed_bio_write, + &next_stripe_start); + if (IS_ERR(bio)) { + ret = errno_to_blk_status(PTR_ERR(bio)); + bio = NULL; + goto finish_cb; + } } - - bio_set_dev(bio, device->bdev); - } - - if (blkcg_css) { - bio->bi_opf |= REQ_CGROUP_PUNT; - kthread_associate_blkcg(blkcg_css); - } - refcount_set(&cb->pending_bios, 1); - - /* create and submit bios for the compressed pages */ - bytes_left = compressed_len; - for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { - int submit = 0; - int len = 0; - - page = compressed_pages[pg_index]; - page->mapping = inode->vfs_inode.i_mapping; - if (bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, - 0); - /* - * Page can only be added to bio if the current bio fits in - * stripe. + * We should never reach next_stripe_start as we will + * submit comp_bio immediately when we reach the boundary. */ - if (!submit) { - if (pg_index == 0 && use_append) - len = bio_add_zone_append_page(bio, page, - PAGE_SIZE, 0); - else - len = bio_add_page(bio, page, PAGE_SIZE, 0); - } - - page->mapping = NULL; - if (submit || len < PAGE_SIZE) { - /- * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. 
Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ + ASSERT(cur_disk_bytenr != next_stripe_start); + /* + * We have various limits on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + + if (use_append) + added = bio_add_zone_append_page(bio, page, real_size, + offset_in_page(offset)); + else + added = bio_add_page(bio, page, real_size, + offset_in_page(offset)); + /* Reached zoned boundary */ + if (added == 0) + submit = true; + + cur_disk_bytenr += added; + /* Reached stripe boundary */ + if (cur_disk_bytenr == next_stripe_start) + submit = true; + + /* Finished the range */ + if (cur_disk_bytenr == disk_start + compressed_len) + submit = true; + + if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); + if (ret) + goto finish_cb; } - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - /* - * Use bio_add_page() to ensure the bio has at least one - * page. - */ - bio_add_page(bio, page, PAGE_SIZE, 0); + ret = submit_compressed_bio(fs_info, cb, bio, 0); + if (ret) + goto finish_cb; + bio = NULL; } - if (bytes_left < PAGE_SIZE) { - btrfs_info(fs_info, - "bytes left %lu compress len %u nr %u", - bytes_left, cb->compressed_len, cb->nr_pages); - } - bytes_left -= PAGE_SIZE; - first_byte += PAGE_SIZE; cond_resched(); } + if (blkcg_css) + kthread_associate_blkcg(NULL); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } + return 0; - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { +finish_cb: + if (bio) { bio->bi_status = ret; bio_endio(bio); } + /* Last byte of @cb is submitted, endio will free @cb */ + if (cur_disk_bytenr == disk_start + compressed_len) + return ret; - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_start + compressed_len - cur_disk_bytenr) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_write(cb); + return ret; } static u64 bio_end_offset(struct bio *bio) @@ -539,25 +637,33 @@ static u64 bio_end_offset(struct bio *bio) return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } +/* + * Add extra pages in the same compressed file extent so that we don't need to + * re-read the same extent again and again. + * + * NOTE: this won't work well for subpage, as for subpage read, we lock the + * full page then submit bio for each compressed/regular extents. 
+ * + * This means, if we have several sectors in the same page point to the same + * on-disk compressed data, we will re-read the same extent many times and + * this function can only help for the next page. + */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - unsigned long pg_index; - u64 last_offset; + u64 cur = bio_end_offset(cb->orig_bio); u64 isize = i_size_read(inode); int ret; struct page *page; - unsigned long nr_pages = 0; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; struct extent_io_tree *tree; - u64 end; - int misses = 0; + int sectors_missed = 0; - last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; @@ -576,18 +682,29 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_SHIFT; + while (cur < compressed_end) { + u64 page_end; + u64 pg_index = cur >> PAGE_SHIFT; + u32 add_size; if (pg_index > end_index) break; page = xa_load(&mapping->i_pages, pg_index); if (page && !xa_is_value(page)) { - misses++; - if (misses > 4) + sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + fs_info->sectorsize_bits; + + /* Beyond threshold, no need to continue */ + if (sectors_missed > 4) break; - goto next; + + /* + * Jump to next page start as we already have a page for + * current offset. + */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } page = __page_cache_alloc(mapping_gfp_constraint(mapping, @@ -597,14 +714,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { put_page(page); - goto next; + /* There is already a page, skip to page end */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } - /* - * at this point, we have a locked page in the page cache - * for these bytes in the file. But, we have to make - * sure they map to this compressed extent on disk. - */ ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -612,18 +726,22 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; } - end = last_offset + PAGE_SIZE - 1; - lock_extent(tree, last_offset, end); + page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + lock_extent(tree, cur, page_end); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, last_offset, - PAGE_SIZE); + em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); - if (!em || last_offset < em->start || - (last_offset + PAGE_SIZE > extent_map_end(em)) || + /* + * At this point, we have a locked page in the page cache for + * these bytes in the file. But, we have to make sure they map + * to this compressed extent on disk. + */
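The validation that follows is the gatekeeper for readahead pages: the looked-up extent map must exist, fully cover the current sector, and resolve to the same physical location the original bio already targets. Reduced to a predicate, under the same assumptions as the code below (the demo_* name is illustrative, not part of the patch):

static bool demo_em_matches_bio(const struct extent_map *em, u64 cur,
                                u32 sectorsize, const struct bio *orig_bio)
{
        if (!em || cur < em->start)
                return false;
        if (cur + sectorsize > extent_map_end(em))
                return false;
        /* Byte address compared against the bio's 512-byte sector cursor. */
        return (em->block_start >> SECTOR_SHIFT) == orig_bio->bi_iter.bi_sector;
}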
+ if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end); + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; @@ -641,20 +759,23 @@ static noinline int add_ra_bio_pages(struct inode *inode, } } - ret = bio_add_page(cb->orig_bio, page, - PAGE_SIZE, 0); - - if (ret == PAGE_SIZE) { - nr_pages++; - put_page(page); - } else { - unlock_extent(tree, last_offset, end); + add_size = min(em->start + em->len, page_end + 1) - cur; + ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; } -next: - last_offset += PAGE_SIZE; + /* + * If it's subpage, we also need to increase its + * subpage::readers number, as at endio we will decrease + * subpage::readers and unlock the page. + */ + if (fs_info->sectorsize < PAGE_SIZE) + btrfs_subpage_start_reader(fs_info, page, cur, add_size); + put_page(page); + cur += add_size; } return 0; } @@ -679,9 +800,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned int compressed_len; unsigned int nr_pages; unsigned int pg_index; - struct page *page; - struct bio *comp_bio; - u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + struct bio *comp_bio = NULL; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; + u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -708,7 +830,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb) goto out; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -748,86 +870,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra_bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - refcount_set(&cb->pending_bios, 1); - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - u32 pg_len = PAGE_SIZE; - int submit = 0; + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!comp_bio) { + comp_bio = alloc_compressed_bio(cb, cur_disk_byte, + REQ_OP_READ, end_compressed_bio_read, + &next_stripe_start); + if (IS_ERR(comp_bio)) { + ret = errno_to_blk_status(PTR_ERR(comp_bio)); + comp_bio = NULL; + goto finish_cb; + } } + /* + * We should never reach next_stripe_start as we will submit + * comp_bio immediately when we reach the boundary. + */
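The next_stripe_start value this loop checks against comes from alloc_compressed_bio() earlier in this diff; its geometry query boils down to the following, condensed into a standalone helper for illustration (the demo_* name is assumed, the btrfs calls are the ones shown above):

static int demo_next_stripe_start(struct btrfs_fs_info *fs_info, struct bio *bio,
                                  u64 disk_bytenr, u64 *next_stripe_start)
{
        struct btrfs_io_geometry geom;
        struct extent_map *em;
        int ret;

        em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
        if (IS_ERR(em))
                return PTR_ERR(em);

        ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
        free_extent_map(em);
        if (ret < 0)
                return ret;

        /* geom.len is how many bytes are left inside the current stripe. */
        *next_stripe_start = disk_bytenr + geom.len;
        return 0;
}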
+ ASSERT(cur_disk_byte != next_stripe_start); + /* + * We have various limits on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); /* - * To handle subpage case, we need to make sure the bio only - * covers the range we need. - * - * If we're at the last page, truncate the length to only cover - * the remaining part. + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always succeed. */ - if (pg_index == nr_pages - 1) - pg_len = min_t(u32, PAGE_SIZE, - compressed_len - pg_index * PAGE_SIZE); + ASSERT(added == real_size); + cur_disk_byte += added; - page = cb->compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_SHIFT; + /* Reached stripe boundary, need to submit */ + if (cur_disk_byte == next_stripe_start) + submit = true; - if (comp_bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, pg_len, - comp_bio, 0); + /* Finished the range, need to submit */ + if (cur_disk_byte == disk_bytenr + compressed_len) + submit = true; - page->mapping = NULL; - if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) { unsigned int nr_sectors; - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto finish_cb; nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - - bio_add_page(comp_bio, page, pg_len, 0); + ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + if (ret) + goto finish_cb; + comp_bio = NULL; } - cur_disk_byte += pg_len; } - - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - return 0; fail2: @@ -842,6 +952,26 @@ fail1: out: free_extent_map(em); return ret; +finish_cb: + if (comp_bio) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + } + /* All bytes of @cb are submitted, endio will free @cb */ + if (cur_disk_byte == disk_bytenr + compressed_len) + return ret; + + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_bytenr + compressed_len - cur_disk_byte) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish @cb manually. 
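To make the wait condition in this error path concrete, take illustrative numbers (assumed, not from the patch): with 4K sectors (sectorsize_bits == 12) and a 64K compressed extent, pending_sectors starts at 16. If the error strikes after 24K (6 sectors) worth of bios were submitted, those 6 in-flight sectors eventually complete and drop the count to 10, which equals (disk_bytenr + compressed_len - cur_disk_byte) >> sectorsize_bits == 40K >> 12. Once that equality holds, no I/O is still running, the submitter owns the last reference, and finish_compressed_bio_read() can safely be called by hand.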
+ */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_read(cb, NULL); + return ret; } /* @@ -1230,7 +1360,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int type = cb->compress_type; workspace = get_workspace(type, 0); - ret = compression_decompress_bio(type, workspace, cb); + ret = compression_decompress_bio(workspace, cb); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 399be0b435bf..56eef0821e3e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -28,8 +28,8 @@ struct btrfs_inode; #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* number of bios pending for this compressed extent */ - refcount_t pending_bios; + /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ + refcount_t pending_sectors; /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84627cbd5b5b..a7db3f6f1b7b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mm.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -462,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -484,8 +485,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } } - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -725,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, } /* - * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. + * Search for a key in the given extent_buffer. * - * the slot in the array is returned via slot, and it points to - * the place where you would insert key if it is not found in - * the array. + * The lower boundary for the search is specified by the slot number @low. Use a + * value of 0 to search over the whole extent buffer. * - * Slot may point to total number of items if the key is bigger than - * all of the keys + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise + * it points to the slot where you would insert the key. + * + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
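The reworked generic_bin_search() above threads a caller-chosen lower slot through an otherwise standard binary search; search_leaf() later in this diff passes 1 once it has proven the key cannot land at slot 0. The same control flow over a plain sorted array, as a self-contained illustration (demo_* name assumed):

/* Returns 0 on exact match, 1 otherwise; *slot is the match or insertion point. */
static int demo_bin_search(const int *keys, int low, int high, int key, int *slot)
{
        while (low < high) {
                int mid = low + (high - low) / 2;

                if (keys[mid] < key)
                        low = mid + 1;
                else if (keys[mid] > key)
                        high = mid;
                else {
                        *slot = mid;
                        return 0;       /* exact match */
                }
        }
        *slot = low;    /* where the key would be inserted */
        return 1;
}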
*/ -static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, int item_size, +static noinline int generic_bin_search(struct extent_buffer *eb, int low, const struct btrfs_key *key, int *slot) { - int low = 0; + unsigned long p; + int item_size; int high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); @@ -752,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, return -EINVAL; } + if (btrfs_header_level(eb) == 0) { + p = offsetof(struct btrfs_leaf, items); + item_size = sizeof(struct btrfs_item); + } else { + p = offsetof(struct btrfs_node, ptrs); + item_size = sizeof(struct btrfs_key_ptr); + } + while (low < high) { unsigned long oip; unsigned long offset; @@ -790,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, } /* - * simple bin_search frontend that does the right thing for - * leaves vs nodes + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot) { - if (btrfs_header_level(eb) == 0) - return generic_bin_search(eb, - offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), key, slot); - else - return generic_bin_search(eb, - offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), key, slot); + return generic_bin_search(eb, 0, key, slot); } static void root_add_used(struct btrfs_root *root, u32 size) @@ -926,7 +930,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -985,7 +989,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), right, + 0, 1); free_extent_buffer_stale(right); right = NULL; } else { @@ -1030,7 +1035,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -1344,33 +1349,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level, { int i; int skip_level = level; - int no_skips = 0; - struct extent_buffer *t; + bool check_skip = true; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break; - if (!no_skips && path->slots[i] == 0) { - skip_level = i + 1; - continue; - } - if (!no_skips && path->keep_locks) { - u32 nritems; - t = path->nodes[i]; - nritems = btrfs_header_nritems(t); - if (nritems < 1 || path->slots[i] >= nritems - 1) { + + if (check_skip) { + if (path->slots[i] == 0) { skip_level = i + 1; continue; } + + if (path->keep_locks) { + u32 nritems; + + nritems = btrfs_header_nritems(path->nodes[i]); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } } - if (skip_level < i && i >= lowest_unlock) - no_skips = 1; - t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level) { - 
btrfs_tree_unlock_rw(t, path->locks[i]); + check_skip = false; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -1566,35 +1572,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; - int root_lock; + int root_lock = 0; int level = 0; - /* We try very hard to do read locks on the root */ - root_lock = BTRFS_READ_LOCK; - if (p->search_commit_root) { - /* - * The commit roots are read only so we always do read locks, - * and we always must hold the commit_root_sem when doing - * searches on them, the only exception is send where we don't - * want to block transaction commits for a long time, so - * we need to clone the commit root in order to avoid races - * with transaction commits that create a snapshot of one of - * the roots used by a send operation. - */ - if (p->need_commit_sem) { - down_read(&fs_info->commit_root_sem); - b = btrfs_clone_extent_buffer(root->commit_root); - up_read(&fs_info->commit_root_sem); - if (!b) - return ERR_PTR(-ENOMEM); - - } else { - b = root->commit_root; - atomic_inc(&b->refs); - } + b = root->commit_root; + atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1611,6 +1595,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, goto out; } + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + /* * If the level is set to maximum, we can skip trying to get the read * lock. @@ -1637,6 +1624,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid; return an error in this case. + */ + if (!extent_buffer_uptodate(b)) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + p->nodes[level] = b; if (!p->skip_locking) p->locks[level] = root_lock; @@ -1646,6 +1644,191 @@ out: return b; } +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */
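From a caller's perspective, together with the btrfs_search_slot() changes further down, a need_commit_sem search now looks like the following sketch (demo_commit_root_lookup is a hypothetical caller, not part of the patch):

static int demo_commit_root_lookup(struct btrfs_root *root,
                                   const struct btrfs_key *key,
                                   struct btrfs_path *path)
{
        /*
         * btrfs_search_slot() takes commit_root_sem itself, searches, clones
         * the lowest level via finish_need_commit_sem_search() and releases
         * the semaphore before returning, so the returned leaf is a private
         * clone that survives concurrent transaction commits.
         */
        path->search_commit_root = 1;
        path->need_commit_sem = 1;

        /* NULL trans: we only read through the commit root. */
        return btrfs_search_slot(NULL, root, key, path, 0, 0);
}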
+static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} + +static inline int search_for_key_slot(struct extent_buffer *eb, + int search_low_slot, + const struct btrfs_key *key, + int prev_cmp, + int *slot) +{ + /* + * If a previous call to btrfs_bin_search() on a parent node returned an + * exact match (prev_cmp == 0), we can safely assume the target key will + * always be at slot 0 on lower levels, since each key pointer + * (struct btrfs_key_ptr) refers to the lowest key accessible from the + * subtree it points to. Thus we can skip searching lower levels. + */ + if (prev_cmp == 0) { + *slot = 0; + return 0; + } + + return generic_bin_search(eb, search_low_slot, key, slot); +} + +static int search_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *path, + int ins_len, + int prev_cmp) +{ + struct extent_buffer *leaf = path->nodes[0]; + int leaf_free_space = -1; + int search_low_slot = 0; + int ret; + bool do_bin_search = true; + + /* + * If we are doing an insertion, the leaf has enough free space and the + * destination slot for the key is not slot 0, then we can unlock our + * write lock on the parent, and any other upper nodes, before doing the + * binary search on the leaf (with search_for_key_slot()), allowing other + * tasks to lock the parent and any other upper nodes. + */ + if (ins_len > 0) { + /* + * Cache the leaf free space, since we will need it later and it + * will not change until then. + */ + leaf_free_space = btrfs_leaf_free_space(leaf); + + /* + * !path->locks[1] means we have a single node tree, the leaf is + * the root of the tree. + */ + if (path->locks[1] && leaf_free_space >= ins_len) { + struct btrfs_disk_key first_key; + + ASSERT(btrfs_header_nritems(leaf) > 0); + btrfs_item_key(leaf, &first_key, 0); + + /* + * Doing the extra comparison with the first key is cheap, + * taking into account that the first key is very likely + * already in a cache line because it immediately follows + * the extent buffer's header and we have recently accessed + * the header's level field. + */ + ret = comp_keys(&first_key, key); + if (ret < 0) { + /* + * The first key is smaller than the key we want + * to insert, so we are safe to unlock all upper + * nodes and we have to do the binary search. + * + * We do use btrfs_unlock_up_safe() and not + * unlock_up() because the latter does not unlock + * nodes with a slot of 0 - we can safely unlock + * any node even if its slot is 0 since in this + * case the key does not end up at slot 0 of the + * leaf and there's no need to split the leaf. + */ + btrfs_unlock_up_safe(path, 1); + search_low_slot = 1; + } else { + /* + * The first key is >= the key we want to + * insert, so we can skip the binary search as + * the target key will be at slot 0. + * + * We cannot unlock upper nodes when the key is + * less than the first key, because we will need + * to update the key at slot 0 of the parent node + * and possibly of other upper nodes too. 
+ * If the key matches the first key, then we can + * unlock all the upper nodes, using + * btrfs_unlock_up_safe() instead of unlock_up() + * as stated above. + */ + if (ret == 0) + btrfs_unlock_up_safe(path, 1); + /* + * ret is already 0 or 1, matching the result of + * a btrfs_bin_search() call, so there is no need + * to adjust it. + */ + do_bin_search = false; + path->slots[0] = 0; + } + } + } + + if (do_bin_search) { + ret = search_for_key_slot(leaf, search_low_slot, key, + prev_cmp, &path->slots[0]); + if (ret < 0) + return ret; + } + + if (ins_len > 0) { + /* + * Item key already exists. In this case, if we are allowed to + * insert the item (for example, in dir_item case, item key + * collision is allowed), it will be merged with the original + * item. Only the item size grows, no new btrfs item will be + * added. If search_for_extension is not set, ins_len already + * accounts the size btrfs_item, deduct it here so leaf space + * check will be correct. + */ + if (ret == 0 && !path->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } + + ASSERT(leaf_free_space >= 0); + + if (leaf_free_space < ins_len) { + int err; + + err = split_leaf(trans, root, key, path, ins_len, + (ret == 0)); + ASSERT(err <= 0); + if (WARN_ON(err > 0)) + err = -EUCLEAN; + if (err) + ret = err; + } + } + + return ret; +} /* * btrfs_search_slot - look for a key in a tree and perform necessary @@ -1682,6 +1865,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -1723,6 +1907,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, min_write_lock_level = write_lock_level; + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + down_read(&fs_info->commit_root_sem); + } + again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); @@ -1776,10 +1965,6 @@ again: } cow_done: p->nodes[level] = b; - /* - * Leave path with blocking locks to avoid massive - * lock context switch, this is made on purpose. - */ /* * we have a lock on b and as long as we aren't changing @@ -1801,62 +1986,22 @@ cow_done: } } - /* - * If btrfs_bin_search returns an exact match (prev_cmp == 0) - * we can safely assume the target key will always be in slot 0 - * on lower levels due to the invariants BTRFS' btree provides, - * namely that a btrfs_key_ptr entry always points to the - * lowest key in the child node, thus we can skip searching - * lower levels - */ - if (prev_cmp == 0) { - slot = 0; - ret = 0; - } else { - ret = btrfs_bin_search(b, key, &slot); - prev_cmp = ret; - if (ret < 0) - goto done; - } - if (level == 0) { - p->slots[level] = slot; - /* - * Item key already exists. In this case, if we are - * allowed to insert the item (for example, in dir_item - * case, item key collision is allowed), it will be - * merged with the original item. Only the item size - * grows, no new btrfs item will be added. If - * search_for_extension is not set, ins_len already - * accounts the size btrfs_item, deduct it here so leaf - * space check will be correct. 
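Condensing the three outcomes of that first-key comparison, as a restatement for orientation (not additional code in the patch): ret < 0 means the insert target cannot be slot 0, so all upper nodes are unlocked and the binary search starts at slot 1; ret == 0 means the target is exactly slot 0 and the search is skipped, while unlocking is still safe since no key at slot 0 needs updating; ret > 0 means the new key becomes the leaf's smallest, so the search is skipped but the parents stay locked because their slot-0 keys may need to be rewritten.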
- */ - if (ret == 0 && ins_len > 0 && !p->search_for_extension) { - ASSERT(ins_len >= sizeof(struct btrfs_item)); - ins_len -= sizeof(struct btrfs_item); - } - if (ins_len > 0 && - btrfs_leaf_free_space(b) < ins_len) { - if (write_lock_level < 1) { - write_lock_level = 1; - btrfs_release_path(p); - goto again; - } + if (ins_len > 0) + ASSERT(write_lock_level >= 1); - err = split_leaf(trans, root, key, - p, ins_len, ret == 0); - - BUG_ON(err > 0); - if (err) { - ret = err; - goto done; - } - } + ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); if (!p->search_for_split) unlock_up(p, level, lowest_unlock, min_write_lock_level, NULL); goto done; } + + ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); + if (ret < 0) + goto done; + prev_cmp = ret; + if (ret && slot > 0) { dec = 1; slot--; @@ -1917,6 +2062,16 @@ cow_done: done: if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); @@ -2487,7 +2642,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, int ret; BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2614,19 +2769,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { - struct btrfs_item *start_item; - struct btrfs_item *end_item; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - start_item = btrfs_item_nr(start); - end_item = btrfs_item_nr(end); - data_len = btrfs_item_offset(l, start_item) + - btrfs_item_size(l, start_item); - data_len = data_len - btrfs_item_offset(l, end_item); + data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); + data_len = data_len - btrfs_item_offset(l, end); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -2673,7 +2823,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, u32 i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 nr; u32 right_nritems; u32 data_end; @@ -2690,8 +2839,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] > i) break; @@ -2706,12 +2853,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(left, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(left, i); + if (this_item_size + sizeof(struct btrfs_item) + + push_space > free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); if (i == 0) break; i--; @@ -2725,7 +2873,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* push left to right */ right_nritems = btrfs_header_nritems(right); - push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space = btrfs_item_data_end(left, left_nritems - push_items); push_space -= leaf_data_end(left); /* make room in the right data area */ @@ -2756,9 +2904,8 @@ 
static noinline int __push_leaf_right(struct btrfs_path *path, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(i); - push_space -= btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, item, push_space); + push_space -= btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } left_nritems -= push_items; @@ -2827,7 +2974,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); /* @@ -2903,7 +3050,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, int i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 old_left_nritems; u32 nr; int ret = 0; @@ -2917,8 +3063,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(i); - if (!empty && push_items > 0) { if (path->slots[0] < i) break; @@ -2933,12 +3077,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(right, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(right, i); + if (this_item_size + sizeof(struct btrfs_item) + push_space > + free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); } if (push_items == 0) { @@ -2954,25 +3099,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - - btrfs_item_offset_nr(right, push_items - 1); + btrfs_item_offset(right, push_items - 1); copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left) - push_space, BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset_nr(right, push_items - 1), + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); btrfs_init_map_token(&token, left); - old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(i); - - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2983,7 +3126,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, right_nritems); if (push_items < right_nritems) { - push_space = btrfs_item_offset_nr(right, push_items - 1) - + push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, @@ -3001,10 +3144,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < 
right_nritems; i++) { - item = btrfs_item_nr(i); - - push_space = push_space - btrfs_token_item_size(&token, item); - btrfs_set_token_item_offset(&token, item, push_space); + push_space = push_space - btrfs_token_item_size(&token, i); + btrfs_set_token_item_offset(&token, i, push_space); } btrfs_mark_buffer_dirty(left); @@ -3065,7 +3206,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* @@ -3132,7 +3273,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); + data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), @@ -3143,15 +3284,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(l), data_copy_size); - rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -3267,7 +3407,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - if (extend && data_size + btrfs_item_size_nr(l, slot) + + if (extend && data_size + btrfs_item_size(l, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; @@ -3436,7 +3576,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3456,7 +3596,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, ret = -EAGAIN; leaf = path->nodes[0]; /* if our item isn't there, return now */ - if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) + if (item_size != btrfs_item_size(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. 
return now */ @@ -3487,9 +3627,7 @@ static noinline int split_item(struct btrfs_path *path, unsigned long split_offset) { struct extent_buffer *leaf; - struct btrfs_item *item; - struct btrfs_item *new_item; - int slot; + int orig_slot, slot; char *buf; u32 nritems; u32 item_size; @@ -3499,9 +3637,9 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - item = btrfs_item_nr(path->slots[0]); - orig_offset = btrfs_item_offset(leaf, item); - item_size = btrfs_item_size(leaf, item); + orig_slot = path->slots[0]; + orig_offset = btrfs_item_offset(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); buf = kmalloc(item_size, GFP_NOFS); if (!buf) @@ -3522,14 +3660,12 @@ static noinline int split_item(struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(slot); - - btrfs_set_item_offset(leaf, new_item, orig_offset); - btrfs_set_item_size(leaf, new_item, item_size - split_offset); + btrfs_set_item_offset(leaf, slot, orig_offset); + btrfs_set_item_size(leaf, slot, item_size - split_offset); - btrfs_set_item_offset(leaf, item, - orig_offset + item_size - split_offset); - btrfs_set_item_size(leaf, item, split_offset); + btrfs_set_item_offset(leaf, orig_slot, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, orig_slot, split_offset); btrfs_set_header_nritems(leaf, nritems + 1); @@ -3581,40 +3717,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, } /* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - -/* * make the item pointed to by the path smaller. 
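To make the offset bookkeeping in the split_item() hunk above concrete, a worked example with purely illustrative numbers: suppose the item being split has offset 3000 and size 100 within the leaf data area, and split_offset is 40.

        /*
         * After the offset/size assignments in split_item():
         *   new slot (orig_slot + 1): offset = 3000, size = 100 - 40 = 60
         *   orig_slot:                offset = 3000 + 100 - 40 = 3060, size = 40
         *
         * Item data offsets decrease as slot numbers increase, so the first
         * 40 payload bytes land at the higher offset and the remaining 60 at
         * the lower one; the unchanged context code (elided from the hunk)
         * copies each half back from the temporary buffer into its new place.
         */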
new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3624,7 +3726,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data_start; @@ -3636,14 +3737,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) leaf = path->nodes[0]; slot = path->slots[0]; - old_size = btrfs_item_size_nr(leaf, slot); + old_size = btrfs_item_size(leaf, slot); if (old_size == new_size) return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - old_data_start = btrfs_item_offset_nr(leaf, slot); + old_data_start = btrfs_item_offset(leaf, slot); size_diff = old_size - new_size; @@ -3657,10 +3758,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + size_diff); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + size_diff); } /* shift the data */ @@ -3703,8 +3803,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) fixup_low_keys(path, &disk_key, 1); } - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, new_size); + btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3720,7 +3819,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data; @@ -3738,7 +3836,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) BUG(); } slot = path->slots[0]; - old_data = btrfs_item_end_nr(leaf, slot); + old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); if (slot >= nritems) { @@ -3755,10 +3853,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff - data_size); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff - data_size); } /* shift the data */ @@ -3767,9 +3864,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end, old_data - data_end); data_end = old_data; - old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(slot); - btrfs_set_item_size(leaf, item, old_size + data_size); + old_size = btrfs_item_size(leaf, slot); + btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3785,16 +3881,12 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items - * @cpu_key: array of keys for items to be inserted - * @data_size: size of the body of each item we are going to insert - * @nr: size of @cpu_key/@data_size arrays + * @batch: information about the batch of items to insert */ -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static void 
setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; @@ -3803,14 +3895,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; u32 total_size; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - total_size = total_data + (nr * sizeof(struct btrfs_item)); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ if (path->slots[0] == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -3820,6 +3912,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); @@ -3830,7 +3923,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_init_map_token(&token, leaf); if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); + unsigned int old_data = btrfs_item_data_end(leaf, slot); if (old_data < data_end) { btrfs_print_leaf(leaf); @@ -3846,34 +3939,33 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, - ioff - total_data); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - total_data, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + data_end - batch->total_data_size, + BTRFS_LEAF_DATA_OFFSET + data_end, + old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(slot + i); - data_end -= data_size[i]; - btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, data_size[i]); + data_end -= batch->data_sizes[i]; + btrfs_set_token_item_offset(&token, slot + i, data_end); + btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3883,26 +3975,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } /* + * Insert a new item into a leaf. + * + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. 
+ * @data_size: The size of the data associated with the new key. + */ +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); +} + +/* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@ -3911,7 +4020,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, nr); + setup_items_for_insert(root, path, batch); return 0; } @@ -3943,6 +4052,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * delete the pointer from a given node. 
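For callers inserting several items at once, the parallel key/size array arguments are replaced by a single descriptor. A minimal sketch of filling one, assuming two keys already in sorted order (trans, root, path and the inode number ino come from the caller; key types and sizes are illustrative):

        struct btrfs_key keys[2];
        u32 sizes[2] = { 16, 32 };
        struct btrfs_item_batch batch;
        int ret;

        keys[0].objectid = ino;
        keys[0].type = BTRFS_DIR_INDEX_KEY;
        keys[0].offset = 2;
        keys[1].objectid = ino;
        keys[1].type = BTRFS_DIR_INDEX_KEY;
        keys[1].offset = 3;

        batch.keys = keys;
        batch.data_sizes = sizes;
        batch.total_data_size = 16 + 32;        /* precomputed by the caller */
        batch.nr = 2;

        ret = btrfs_insert_empty_items(trans, root, path, &batch);

Precomputing total_data_size is the point of the structure: it saves an extra loop over the sizes array inside the leaf-locked critical section.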
* * the tree should have been previously balanced so the deletion does not @@ -4015,7 +4158,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); } /* @@ -4027,7 +4170,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - struct btrfs_item *item; u32 last_off; u32 dsize = 0; int ret = 0; @@ -4036,10 +4178,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + last_off = btrfs_item_offset(leaf, slot + nr - 1); for (i = 0; i < nr; i++) - dsize += btrfs_item_size_nr(leaf, slot + i); + dsize += btrfs_item_size(leaf, slot + i); nritems = btrfs_header_nritems(leaf); @@ -4056,9 +4198,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(&token, item); - btrfs_set_token_item_offset(&token, item, ioff + dsize); + ioff = btrfs_token_item_offset(&token, i); + btrfs_set_token_item_offset(&token, i, ioff + dsize); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -4385,7 +4526,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; int i; @@ -4402,14 +4545,20 @@ again: path->keep_locks = 1; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + down_read(&fs_info->commit_root_sem); + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } path->keep_locks = 0; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* @@ -4532,6 +4681,15 @@ again: ret = 0; done: unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dff2c8a3e059..b4a9b1c58d22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; +struct btrfs_bio; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -142,6 +143,8 @@ enum { BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, }; #define BTRFS_BACKREF_REV_MAX 256 @@ -217,6 +220,9 @@ struct btrfs_root_backup { u8 unused_8[10]; } __attribute__ ((__packed__)); +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -269,7 +275,11 @@ struct btrfs_super_block { __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup 
super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + u8 padding[565]; } __attribute__ ((__packed__)); +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * Compat flags that we support. If any incompat flags are set other than the @@ -503,11 +513,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ @@ -545,7 +550,6 @@ struct btrfs_swapfile_pin { bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); enum { - BTRFS_FS_BARRIER, BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, BTRFS_FS_LOG_RECOVERING, @@ -568,7 +572,6 @@ enum { /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. - * Set, tested and cleared while holding fs_info::send_reloc_lock. */ BTRFS_FS_RELOC_RUNNING, @@ -593,6 +596,9 @@ enum { /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -605,6 +611,7 @@ enum { */ enum btrfs_exclusive_operation { BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, BTRFS_EXCLOP_BALANCE, BTRFS_EXCLOP_DEV_ADD, BTRFS_EXCLOP_DEV_REMOVE, @@ -616,20 +623,21 @@ enum btrfs_exclusive_operation { struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; - struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; - struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; - struct btrfs_root *free_space_root; struct btrfs_root *data_reloc_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; @@ -665,6 +673,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. 
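The new last_reloc_trans counter gives readers a cheap way to notice that a block group relocation committed since they last sampled it. A hedged sketch of the intended consumer pattern (names and control flow assumed for illustration; the actual consumer is the send path elsewhere in this series):

        u64 seen;

        down_read(&fs_info->commit_root_sem);
        seen = fs_info->last_reloc_trans;
        up_read(&fs_info->commit_root_sem);

        /* ... do work against the commit roots ... */

        down_read(&fs_info->commit_root_sem);
        if (fs_info->last_reloc_trans > seen) {
                up_read(&fs_info->commit_root_sem);
                goto restart;   /* commit roots may have been swapped */
        }
        up_read(&fs_info->commit_root_sem);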
+ */ + u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* @@ -807,7 +821,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; - struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -899,6 +912,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -943,13 +957,6 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* readahead tree */ - spinlock_t reada_lock; - struct radix_tree_root reada_tree; - - /* readahead works cnt */ - atomic_t reada_works_cnt; - /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ @@ -994,13 +1001,6 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; - spinlock_t send_reloc_lock; - /* - * Number of send operations in progress. - * Updated while holding fs_info::send_reloc_lock. - */ - int send_in_progress; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; @@ -1017,6 +1017,16 @@ struct btrfs_fs_info { spinlock_t treelog_bg_lock; u64 treelog_bg; + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1091,6 +1101,8 @@ enum { BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, + /* We started the orphan cleanup for this root. */ + BTRFS_ROOT_ORPHAN_CLEANUP, }; /* @@ -1109,6 +1121,8 @@ struct btrfs_qgroup_swapped_blocks { * and for the extent tree extent_root root. 
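The struct btrfs_root definition that follows gains an rb_node so that csum and extent roots can be linked into the fs_info->global_root_tree introduced above; the insert and lookup helpers appear in the disk-io.c hunks further down. The rb_find_add() contract they rely on, as a short fragment (locking elided, done under global_root_lock in the real code):

        struct rb_node *tmp;

        /* rb_find_add() links the node and returns NULL on success, or
         * returns the already-present node when an equal key exists --
         * which btrfs_global_root_insert() below turns into -EEXIST. */
        tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree,
                          global_root_cmp);
        return tmp ? -EEXIST : 0;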
*/ struct btrfs_root { + struct rb_node rb_node; + struct extent_buffer *node; struct extent_buffer *commit_root; @@ -1159,8 +1173,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - int orphan_cleanup_state; - spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; @@ -1941,8 +1953,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, } /* struct btrfs_item */ -BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); @@ -1957,25 +1969,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline u32 btrfs_item_end(const struct extent_buffer *eb, - struct btrfs_item *item) -{ - return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); -} - -static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_end(eb, btrfs_item_nr(nr)); +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ + int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ } -static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, btrfs_item_nr(nr)); -} +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); -static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) { - return btrfs_item_size(eb, btrfs_item_nr(nr)); + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); } static inline void btrfs_item_key(const struct extent_buffer *eb, @@ -2238,6 +2261,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2439,7 +2467,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset_nr(leaf, nr - 1); + return btrfs_item_offset(leaf, nr - 1); } /* struct btrfs_file_extent_item */ @@ -2498,9 +2526,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, */ static inline u32 btrfs_file_extent_inline_item_len( const struct extent_buffer *eb, - struct btrfs_item *e) + int nr) { - 
return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* btrfs_qgroup_status_item */ @@ -2592,11 +2620,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset_nr(leaf, slot))) + btrfs_item_offset(leaf, slot))) static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { @@ -2700,7 +2728,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 empty_size, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -2885,16 +2913,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); +/* + * Describes a batch of items to insert in a btree. This is used by + * btrfs_insert_empty_items(). + */ +struct btrfs_item_batch { + /* + * Pointer to an array containing the keys of the items to insert (in + * sorted order). + */ + const struct btrfs_key *keys; + /* Pointer to an array containing the data size for each item to insert. */ + const u32 *data_sizes; + /* + * The sum of data sizes for all items. The caller can compute this while + * setting up the data_sizes array, so it ends up being more efficient + * than having btrfs_insert_empty_items() or setup_item_for_insert() + * doing it, as it would avoid an extra loop over a potentially large + * array, and in the case of setup_item_for_insert(), we would be doing + * it while holding a write lock on a leaf and often on upper level nodes + * too, unnecessarily increasing the size of a critical section. + */ + u32 total_data_size; + /* Size of the keys and data_sizes arrays (number of items in the batch). 
*/ + int nr; +}; + +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); + const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2902,7 +2956,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, const struct btrfs_key *key, u32 data_size) { - return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); @@ -3030,7 +3091,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, @@ -3062,36 +3123,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); -/* inode-item.c */ -int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index); -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - struct btrfs_key *location, int mod); - -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); - -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); -struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -3129,8 +3160,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3142,7 +3174,6 @@ 
void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, @@ -3151,10 +3182,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, u64 new_size, - u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3174,8 +3201,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3242,9 +3267,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file *file, +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_pages); + u64 newer_than, unsigned long max_to_defrag); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, @@ -3255,6 +3280,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3563,6 +3591,9 @@ do { \ (errno), fmt, ##args); \ } while (0) +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) + __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, @@ -3768,23 +3799,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) btrfs_bio_counter_sub(fs_info, 1); } -/* reada.c */ -struct reada_control { - struct btrfs_fs_info *fs_info; /* tree to prefetch */ - struct btrfs_key key_start; - struct btrfs_key key_end; /* exclusive */ - atomic_t elems; - struct kref refcnt; - wait_queue_head_t wait; -}; -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *start, struct btrfs_key *end); -int btrfs_reada_wait(void *handle); -void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct extent_buffer *eb, int err); -void btrfs_reada_remove_dev(struct btrfs_device *dev); -void 
btrfs_reada_undo_remove_dev(struct btrfs_device *dev); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -3842,6 +3856,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) +{ + return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; +} + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 2059d1504149..fb46a28f5065 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -143,10 +143,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) + if (ret < 0) { btrfs_free_reserved_data_space_noquota(fs_info, len); - else + extent_changeset_free(*reserved); + *reserved = NULL; + } else { ret = 0; + } return ret; } @@ -331,7 +334,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; @@ -452,8 +455,11 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len); - if (ret < 0) + if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); + extent_changeset_free(*reserved); + *reserved = NULL; + } return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1e08eb2b27f0..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -13,6 +13,7 @@ #include "ctree.h" #include "qgroup.h" #include "locking.h" +#include "inode-item.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -629,7 +630,7 @@ static int btrfs_delayed_inode_reserve_metadata( BTRFS_QGROUP_RSV_META_PREALLOC, true); if (ret < 0) return ret; - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* NO_FLUSH could only fail with -ENOSPC */ ASSERT(ret == 0 || ret == -ENOSPC); @@ -679,19 +680,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { - LIST_HEAD(batch); + LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + struct btrfs_item_batch batch; int total_size; - int nitems; char *ins_data = NULL; - struct btrfs_key *ins_keys; - u32 *ins_sizes; int ret; - list_add_tail(&first_item->tree_list, &batch); - nitems = 1; + list_add_tail(&first_item->tree_list, &item_list); + batch.total_data_size = first_item->data_len; + batch.nr = 1; total_size = first_item->data_len + sizeof(struct btrfs_item); curr = first_item; @@ -706,39 +706,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, if (total_size + next_size > max_size) break; - list_add_tail(&next->tree_list, &batch); - nitems++; + list_add_tail(&next->tree_list, &item_list); + batch.nr++; total_size += next_size; + 
batch.total_data_size += next->data_len; curr = next; } - if (nitems == 1) { - ins_keys = &first_item->key; - ins_sizes = &first_item->data_len; + if (batch.nr == 1) { + batch.keys = &first_item->key; + batch.data_sizes = &first_item->data_len; } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; int i = 0; - ins_data = kmalloc(nitems * sizeof(u32) + - nitems * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc(batch.nr * sizeof(u32) + + batch.nr * sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; } ins_sizes = (u32 *)ins_data; - ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); - list_for_each_entry(curr, &batch, tree_list) { + ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + list_for_each_entry(curr, &item_list, tree_list) { ins_keys[i] = curr->key; ins_sizes[i] = curr->data_len; i++; } } - ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, - nitems); + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) goto out; - list_for_each_entry(curr, &batch, tree_list) { + list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); @@ -754,7 +758,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); - list_for_each_entry_safe(curr, next, &batch, tree_list) { + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca848b183474..4176df149d04 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", @@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) num_bytes = btrfs_calc_insert_metadata_size(fs_info, trans->delayed_ref_updates); + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. 
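The effect of the doubling is easiest to see with numbers. Assuming the usual definition of btrfs_calc_insert_metadata_size() (nodesize * BTRFS_MAX_LEVEL * 2 per item, covering COW of a full path plus splits), a sketch for a filesystem with 16 KiB nodes:

        u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

        /* 16 KiB * 8 * 2 = 256 KiB for the extent tree update alone... */
        if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
                num_bytes *= 2; /* ...512 KiB once the matching free space
                                 * tree update is accounted for as well. */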
+ */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->full = 0; @@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); @@ -906,7 +927,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); BUG_ON(extent_op && extent_op->is_data); @@ -921,8 +942,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(generic_ref->real_root) && - is_fstree(generic_ref->tree_ref.root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { @@ -938,14 +957,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.root, action, ref_type); - ref->root = generic_ref->tree_ref.root; + generic_ref->tree_ref.owning_root, action, + ref_type); + ref->root = generic_ref->tree_ref.owning_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.root, 0, action, false, - is_system); + generic_ref->tree_ref.owning_root, 0, action, + false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -997,7 +1017,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.ref_root; + u64 ref_root = generic_ref->data_ref.owning_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1026,8 +1046,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root) && - is_fstree(generic_ref->real_root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e22fba272e4f..91a3aabad150 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -186,8 +186,8 @@ enum btrfs_ref_type { struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Root which refers to this data extent */ - u64 ref_root; + /* Original root this data extent belongs to */ + u64 owning_root; /* Inode which refers to this data extent */ u64 ino; @@ -210,11 +210,11 @@ struct btrfs_tree_ref { int level; /* - * Root which refers to this tree block. + * Root which owns this tree block. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 root; + u64 owning_root; /* For non-skinny metadata, no special member needed */ }; @@ -231,17 +231,10 @@ struct btrfs_ref { */ bool skip_qgroup; - /* - * Optional. For which root is this modification. - * Mostly used for qgroup optimization. 
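The is_fstree() checks dropped from the two add_delayed_*_ref paths above are not lost: they move into the btrfs_init_tree_ref()/btrfs_init_data_ref() helpers in the next hunk, which now compute skip_qgroup up front. The condition they encode, condensed into a fragment (root, mod_root and skip_qgroup are the helper arguments):

        /* Qgroup accounting applies only when the owning root is an fs tree
         * and, if a different root performs the modification (e.g. a reloc
         * tree during balance), that root is an fs tree as well. */
        bool tracked = is_fstree(root) && (!mod_root || is_fstree(mod_root));

        generic_ref->skip_qgroup = skip_qgroup || !tracked;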
- * - * When unset, data/tree ref init code will populate it. - * In certain cases, we're modifying reference for a different root. - * E.g. COW fs tree blocks for balance. - * In that case, tree_ref::root will be fs tree, but we're doing this - * for reloc tree, then we should set @real_root to reloc tree. - */ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* Through which root is this modification. */ u64 real_root; +#endif u64 bytenr; u64 len; @@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, } static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root) + int level, u64 root, u64 mod_root, bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = root; + generic_ref->real_root = mod_root ?: root; +#endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.root = root; + generic_ref->tree_ref.owning_root = root; generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + } static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset) + u64 ref_root, u64 ino, u64 offset, u64 mod_root, + bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = ref_root; - generic_ref->data_ref.ref_root = ref_root; + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.owning_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; } static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d029be40ea6f..62b9651ea662 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { + struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; struct btrfs_key key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; @@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found: * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found: } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { @@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found: * We don't have an active replace item but if there is a * replace target, fail the mount. 
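btrfs_find_device() now takes a lookup-args structure rather than a devid/uuid/fsid argument list, which the dev-replace hunks here use as follows (a minimal sketch; the device ids are whatever the caller is after):

        struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
        struct btrfs_device *device;

        device = btrfs_find_device(fs_info->fs_devices, &args);
        if (!device)
                return -ENOENT; /* no replace target present */

        args.devid = src_devid; /* the args struct can be reused for another id */
        device = btrfs_find_device(fs_info->fs_devices, &args);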
*/ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found: break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); + args.devid = src_devid; + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); + /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; @@ -325,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; @@ -384,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* * need to delete old one and insert a new one. * Since no attempt is made to recover any old state, if the @@ -909,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - if (!scrub_ret) - btrfs_reada_remove_dev(src_device); - /* * We have to use this loop approach because at this point src_device * has to be available for transaction commit to complete, yet new @@ -920,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_reada_undo_remove_dev(src_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -971,7 +964,6 @@ error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_reada_undo_remove_dev(src_device); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f1274d5c3805..3b532bab0755 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; - struct btrfs_item *item; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); @@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; - item = btrfs_item_nr(path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); - BUG_ON(data_size > btrfs_item_size(leaf, item)); - ptr += btrfs_item_size(leaf, item) - data_size; + ASSERT(data_size <= btrfs_item_size(leaf, 
path->slots[0])); + ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } @@ -190,9 +188,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( } /* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Look up a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated with the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read-only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exist, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -260,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; - if (data_size + btrfs_item_size_nr(leaf, slot) + + if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { @@ -273,27 +282,42 @@ out: } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Look up a directory index item by name and index number. * - * The name is used to make sure the index really points to the name you were - * looking for. + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated with the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read-only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exist, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). 
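With the -ENOENT translation added just below, callers of btrfs_lookup_dir_index_item() can distinguish "not found" from real failures without comparing against a specific error pointer. The resulting calling pattern, sketched (variable names illustrative):

        struct btrfs_dir_item *di;

        di = btrfs_lookup_dir_index_item(NULL, root, path, dir_ino, index,
                                         name, name_len, 0);
        if (IS_ERR(di))
                return PTR_ERR(di);     /* real lookup failure */
        if (!di)
                return -ENOENT;         /* no matching dir index item */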
*/ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { + struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * @@ -383,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - total_len = btrfs_item_size_nr(leaf, path->slots[0]); + total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + @@ -419,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); - item_len = btrfs_item_size_nr(leaf, path->slots[0]); + item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 355ea88d5c5f..87a5addbedf6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, if (ret < 0) goto err; - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - set_extent_buffer_uptodate(eb); free_extent_buffer(eb); @@ -683,7 +680,7 @@ err: return ret; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror) { @@ -715,10 +712,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, } ret = validate_extent_buffer(eb); err: - if (reads_done && - test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, ret); - if (ret) { /* * our io error hook is going to dec the io pages @@ -1036,7 +1029,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); return __set_page_dirty_nobuffers(page); } ASSERT(PagePrivate(page) && page->private); @@ -1061,7 +1054,7 @@ static int btree_set_page_dirty(struct page *page) ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); @@ -1125,7 +1118,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, @@ -1140,11 +1133,16 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = 
test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; + root->root_key.objectid = objectid; root->node = NULL; root->commit_root = NULL; root->state = 0; - root->orphan_cleanup_state = 0; + RB_CLEAR_NODE(&root->rb_node); root->last_trans = 0; root->free_objectid = 0; @@ -1152,7 +1150,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); - root->block_rsv = NULL; + + btrfs_init_root_block_rsv(root); INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1190,6 +1189,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; + root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); @@ -1197,12 +1197,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, IO_TREE_LOG_CSUM_RANGE, NULL); } - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - root->root_key.objectid = objectid; - root->anon_dev = 0; - spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG @@ -1242,6 +1236,81 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) } #endif +static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node) +{ + const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node); + const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(&a->root_key, &b->root_key); +} + +static int global_root_key_cmp(const void *k, const struct rb_node *node) +{ + const struct btrfs_key *key = k; + const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node); + + return btrfs_comp_cpu_keys(key, &root->root_key); +} + +int btrfs_global_root_insert(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *tmp; + + write_lock(&fs_info->global_root_lock); + tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); + write_unlock(&fs_info->global_root_lock); + ASSERT(!tmp); + + return tmp ? 
-EEXIST : 0; +} + +void btrfs_global_root_delete(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + write_lock(&fs_info->global_root_lock); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + write_unlock(&fs_info->global_root_lock); +} + +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key) +{ + struct rb_node *node; + struct btrfs_root *root = NULL; + + read_lock(&fs_info->global_root_lock); + node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp); + if (node) + root = container_of(node, struct btrfs_root, rb_node); + read_unlock(&fs_info->global_root_lock); + + return root; +} + +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_CSUM_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_key key = { + .objectid = BTRFS_EXTENT_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(fs_info, &key); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -1500,7 +1569,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + !btrfs_is_data_reloc_root(root)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1554,25 +1623,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, u64 objectid) { + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + if (objectid == BTRFS_ROOT_TREE_OBJECTID) return btrfs_grab_root(fs_info->tree_root); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_CHUNK_TREE_OBJECTID) return btrfs_grab_root(fs_info->chunk_root); if (objectid == BTRFS_DEV_TREE_OBJECTID) return btrfs_grab_root(fs_info->dev_root); if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? - fs_info->free_space_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + struct btrfs_root *root = btrfs_global_root(fs_info, &key); + + return btrfs_grab_root(root) ? 
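Worth noting about the two helpers just added: btrfs_csum_root() and btrfs_extent_root() take a bytenr but, as written, always look up the key (objectid, ROOT_ITEM_KEY, 0); the argument is plumbing for a future with more than one global root per objectid. A bsearch()-based stand-in for the rb_find() lookup (2, 7 and 10 are the kernel's extent, csum and free-space tree objectids and 132 is ROOT_ITEM_KEY; the array and names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };
struct root { struct key key; const char *name; };

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* bsearch() comparator: a key against an entry, like global_root_key_cmp. */
static int root_cmp(const void *k, const void *e)
{
	return key_cmp(k, &((const struct root *)e)->key);
}

int main(void)
{
	struct root tree[] = {
		{ { 2, 132, 0 }, "extent" },
		{ { 7, 132, 0 }, "csum" },
		{ { 10, 132, 0 }, "free space" },
	};
	struct key want = { 7, 132, 0 };
	struct root *r = bsearch(&want, tree, 3, sizeof(tree[0]), root_cmp);

	printf("%s\n", r ? r->name : "(none)");
	return 0;
}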
root : ERR_PTR(-ENOENT); + } return NULL; } @@ -1619,6 +1696,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) #endif } +static void free_global_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct rb_node *node; + + while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) { + root = rb_entry(node, struct btrfs_root, rb_node); + rb_erase(&root->rb_node, &fs_info->global_root_tree); + btrfs_put_root(root); + } +} + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); @@ -1630,20 +1719,19 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - btrfs_put_root(fs_info->extent_root); + free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); btrfs_put_root(fs_info->dev_root); - btrfs_put_root(fs_info->csum_root); btrfs_put_root(fs_info->quota_root); btrfs_put_root(fs_info->uuid_root); - btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1731,6 +1819,14 @@ again: } return root; fail: + /* + * If our caller provided us an anonymous device, then it is their + * responsibility to free it in case we fail. So we have to set our + * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() + * and once again by our caller. + */ + if (anon_dev) + root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); } @@ -1926,7 +2022,8 @@ static int transaction_kthread(void *arg) } delta = ktime_get_seconds() - cur->start_time; - if (cur->state < TRANS_STATE_COMMIT_START && + if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && + cur->state < TRANS_STATE_COMMIT_START && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -1953,8 +2050,7 @@ sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, - &fs_info->fs_state))) + if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || @@ -1999,6 +2095,8 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2023,11 +2121,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root(root_backup, extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(info->extent_root->node)); + btrfs_header_generation(extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(info->extent_root->node)); + btrfs_header_level(extent_root->node)); /* * we might commit during log recovery, which happens before we set @@ -2048,11 +2146,11 @@ static void
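The fail label above encodes an ownership rule: an anonymous device supplied by the caller must be freed exactly once, by the caller, so the root zeroes its copy before btrfs_put_root() runs. The same zero-before-destroy pattern in a generic, runnable form (every name here is hypothetical):

#include <stdlib.h>

struct resource { int *caller_owned; };

static void put_resource(struct resource *r)
{
	free(r->caller_owned);	/* destructor frees whatever is still attached */
	free(r);
}

static struct resource *setup(int *caller_buf, int simulate_failure)
{
	struct resource *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	r->caller_owned = caller_buf;
	if (simulate_failure) {
		/* Detach before destroying, or caller_buf is freed twice. */
		r->caller_owned = NULL;
		put_resource(r);
		return NULL;
	}
	return r;
}

int main(void)
{
	int *buf = malloc(sizeof(*buf));
	struct resource *r = setup(buf, 1);

	if (!r)
		free(buf);	/* the caller frees its own buffer, once */
	return 0;
}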
backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(info->csum_root->node)); + btrfs_header_generation(csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(info->csum_root->node)); + btrfs_header_level(csum_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); @@ -2127,7 +2225,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); - btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); if (fs_info->discard_ctl.discard_workers) @@ -2151,21 +2248,29 @@ static void free_root_extent_buffers(struct btrfs_root *root) } } +static void free_global_root_pointers(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root, *tmp; + + rbtree_postorder_for_each_entry_safe(root, tmp, + &fs_info->global_root_tree, + rb_node) + free_root_extent_buffers(root); +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); + free_global_root_pointers(info); free_root_extent_buffers(info->dev_root); - free_root_extent_buffers(info->extent_root); - free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); - free_root_extent_buffers(info->free_space_root); } void btrfs_put_root(struct btrfs_root *root) @@ -2283,8 +2388,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->qgroup_rescan_lock); } -static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices) +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; @@ -2333,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue(fs_info, "readahead", flags, - max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->discard_ctl.discard_workers = @@ -2347,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers && + fs_info->caching_workers && fs_info->fixup_workers && + fs_info->delayed_workers && fs_info->qgroup_rescan_workers && fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2427,6 +2527,104 @@ static int btrfs_replay_log(struct 
btrfs_fs_info *fs_info, return 0; } +static int load_global_roots_objectid(struct btrfs_root *tree_root, + struct btrfs_path *path, u64 objectid, + const char *name) +{ + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *root; + int ret; + struct btrfs_key key = { + .objectid = objectid, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + bool found = false; + + /* If we have IGNOREDATACSUMS skip loading these roots. */ + if (objectid == BTRFS_CSUM_TREE_OBJECTID && + btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + return 0; + } + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + break; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(tree_root, path); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + } + ret = 0; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != objectid) + break; + btrfs_release_path(path); + + found = true; + root = read_tree_root_path(tree_root, path, &key); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = PTR_ERR(root); + break; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + ret = btrfs_global_root_insert(root); + if (ret) { + btrfs_put_root(root); + break; + } + key.offset++; + } + btrfs_release_path(path); + + if (!found || ret) { + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) + ret = ret ? ret : -ENOENT; + else + ret = 0; + btrfs_err(fs_info, "failed to load root %s", name); + } + return ret; +} + +static int load_global_roots(struct btrfs_root *tree_root) +{ + struct btrfs_path *path; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = load_global_roots_objectid(tree_root, path, + BTRFS_EXTENT_TREE_OBJECTID, "extent"); + if (ret) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_CSUM_TREE_OBJECTID, "csum"); + if (ret) + goto out; + if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) + goto out; + ret = load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); +out: + btrfs_free_path(path); + return ret; +} + static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; @@ -2436,7 +2634,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) BUG_ON(!fs_info->tree_root); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + ret = load_global_roots(tree_root); + if (ret) + return ret; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; @@ -2448,38 +2650,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; - } - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ btrfs_init_devices_late(fs_info); - /* If IGNOREDATACSUMS is set don't bother reading the csum root. 
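load_global_roots_objectid() above gathers every ROOT_ITEM sharing an objectid by re-searching with key.offset bumped past each hit, so a second (or third) global root of the same type would be picked up transparently. The same scan over a sorted array standing in for the tree root (7 and 132 are the kernel's csum-tree objectid and ROOT_ITEM_KEY; the array contents are invented):

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	struct key items[] = {	/* sorted, like items in the tree root */
		{ 2, 132, 0 }, { 7, 132, 0 }, { 7, 132, 1 }, { 10, 132, 0 },
	};
	size_t n = sizeof(items) / sizeof(items[0]);
	struct key search = { 7, 132, 0 };
	int found = 0;

	for (;;) {
		size_t i;

		/* btrfs_search_slot() stand-in: first item >= search key. */
		for (i = 0; i < n && key_cmp(&items[i], &search) < 0; i++)
			;
		if (i == n || items[i].objectid != search.objectid)
			break;
		found = 1;
		printf("load csum root, offset %llu\n",
		       (unsigned long long)items[i].offset);
		search.offset = items[i].offset + 1;	/* the key.offset++ */
	}
	if (!found)
		printf("-ENOENT\n");
	return 0;
}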
*/ - if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; - } - } - /* * This tree can share blocks with some other fs tree during relocation * and we need a proper setup by btrfs_get_fs_root @@ -2517,20 +2692,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; - } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; - } - } - return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2592,8 +2753,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, /* * For 4K page size, we only support 4K sector size. - * For 64K page size, we support read-write for 64K sector size, and - * read-only for 4K sector size. + * For 64K page size, we support 64K and 4K sector sizes. */ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && @@ -2851,6 +3011,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { @@ -2883,7 +3044,10 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->zone_active_bgs_lock); + spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); + rwlock_init(&fs_info->global_root_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); @@ -2896,6 +3060,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -2914,9 +3079,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->global_root_tree = RB_ROOT; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2924,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ - /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - spin_lock_init(&fs_info->reada_lock); btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, @@ -2948,7 +3110,6 @@ void 
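The reworded comment in validate_super() pins down exactly two page-size families; the elided tail of the condition presumably rejects anything other than 4K or 64K sectors on 64K pages. The check as a runnable truth table (PAGE_SIZE turned into a parameter so both architectures can be exercised from one build):

#include <stdio.h>

#define SZ_4K 4096UL
#define SZ_64K 65536UL

/* Mirror of the condition: 4K pages take only 4K sectors; 64K pages take
 * 4K (subpage) or 64K sectors. */
static int sectorsize_invalid(unsigned long page_size, unsigned long sectorsize)
{
	return (page_size == SZ_4K && sectorsize != page_size) ||
	       (page_size == SZ_64K &&
		(sectorsize != SZ_4K && sectorsize != SZ_64K));
}

int main(void)
{
	printf("%d %d %d\n",
	       sectorsize_invalid(SZ_4K, SZ_4K),	/* 0: ok */
	       sectorsize_invalid(SZ_4K, SZ_64K),	/* 1: rejected */
	       sectorsize_invalid(SZ_64K, SZ_4K));	/* 0: ok, subpage */
	return 0;
}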
btrfs_init_fs_info(struct btrfs_fs_info *fs_info) extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); - set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2983,9 +3144,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; - spin_lock_init(&fs_info->send_reloc_lock); - fs_info->send_in_progress = 0; - fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } @@ -3228,12 +3386,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); - invalidate_bdev(fs_devices->latest_bdev); + invalidate_bdev(fs_devices->latest_dev->bdev); /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { err = PTR_ERR(disk_super); goto fail_alloc; @@ -3392,12 +3550,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - if (sectorsize != PAGE_SIZE) { + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - } - if (sectorsize != PAGE_SIZE) { if (btrfs_super_incompat_flags(fs_info->super_copy) & BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, @@ -3406,9 +3564,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); + if (!subpage_info) + goto fail_alloc; + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; } - ret = btrfs_init_workqueues(fs_info, fs_devices); + ret = btrfs_init_workqueues(fs_info); if (ret) { err = ret; goto fail_sb_buffer; @@ -3465,7 +3628,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_bdev) { + if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -3556,7 +3719,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_free_zone_cache(fs_info); + + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3740,7 +3906,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, else if (ret) return ERR_PTR(ret); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@ -3881,7 +4047,9 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); - btrfs_advance_sb_log(device, i); + + if (btrfs_advance_sb_log(device, i)) + errors++; } return errors < i ? 
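One detail of write_dev_supers() visible above: with i mirror copies attempted, the function reports success as long as errors < i, i.e. at least one superblock copy reached the disk (and a failed btrfs_advance_sb_log() now counts as an error too). A small model of that accounting (BTRFS_SUPER_MIRROR_MAX is 3, per the disk-io.h hunk later in this diff; the fail[] vector is invented):

#include <stdio.h>

#define SUPER_MIRROR_MAX 3

static int write_supers(int max_mirrors, const int fail[])
{
	int errors = 0;
	int i;

	for (i = 0; i < max_mirrors && i < SUPER_MIRROR_MAX; i++)
		if (fail[i])
			errors++;
	/* Success while at least one mirror write went through. */
	return errors < i ? 0 : -1;
}

int main(void)
{
	int one_bad[SUPER_MIRROR_MAX] = { 1, 0, 0 };
	int all_bad[SUPER_MIRROR_MAX] = { 1, 1, 1 };

	printf("%d %d\n", write_supers(3, one_bad), write_supers(3, all_bad));
	return 0;
}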
0 : -1; } @@ -3968,11 +4136,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However, when the integrity checker is enabled, + * skipping it results in reports that there are metadata blocks referred + * to by a superblock that were not properly flushed. So, to keep things + * simple, skip the bio submission only when the integrity checker is not + * built in, since it is a debug tool not meant for use in non-debug + * builds. + */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; @@ -4221,7 +4401,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@ -4303,6 +4483,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) return btrfs_commit_transaction(trans); } +static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_transaction *trans; + struct btrfs_transaction *tmp; + bool found = false; + + if (list_empty(&fs_info->trans_list)) + return; + + /* + * This function is only called at the very end of close_ctree(), + * thus there is no other running transaction and no need to take + * trans_lock.
+ */ + ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); + list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { + struct extent_state *cached = NULL; + u64 dirty_bytes = 0; + u64 cur = 0; + u64 found_start; + u64 found_end; + + found = true; + while (!find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, EXTENT_DIRTY, &cached)) { + dirty_bytes += found_end + 1 - found_start; + cur = found_end + 1; + } + btrfs_warn(fs_info, + "transaction %llu (with %llu dirty metadata bytes) is not committed", + trans->transid, dirty_bytes); + btrfs_cleanup_one_transaction(trans, fs_info); + + if (trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + list_del_init(&trans->list); + + btrfs_put_transaction(trans); + trace_btrfs_transaction_commit(fs_info); + } + ASSERT(!found); +} + void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; @@ -4372,8 +4594,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || - test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); @@ -4412,7 +4633,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_stop_all_workers(fs_info); /* We shouldn't have any transaction open at this point */ - ASSERT(list_empty(&fs_info->trans_list)); + warn_about_uncommitted_trans(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); @@ -4470,7 +4691,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); @@ -4960,7 +5181,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(t); - trace_btrfs_transaction_commit(fs_info->tree_root); + trace_btrfs_transaction_commit(fs_info); spin_lock(&fs_info->trans_lock); } spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0e7e9526b6a8..5e8bef4b7563 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,9 +6,6 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 - #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -74,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); +int btrfs_global_root_insert(struct btrfs_root *root); +void btrfs_global_root_delete(struct btrfs_root *root); +struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key); +struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -81,7 +84,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct 
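The accounting in warn_about_uncommitted_trans() walks EXTENT_DIRTY ranges whose endpoints are inclusive, hence dirty_bytes += found_end + 1 - found_start. The same accumulation over a fixed list of ranges (the ranges are invented; find_first_extent_bit() is modeled by the array walk):

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start; uint64_t end; };	/* both ends inclusive */

int main(void)
{
	struct range dirty[] = { { 0, 4095 }, { 16384, 32767 } };
	uint64_t dirty_bytes = 0;
	uint64_t cur = 0;
	size_t i;

	for (i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
		if (dirty[i].start < cur)	/* already counted */
			continue;
		dirty_bytes += dirty[i].end + 1 - dirty[i].start;
		cur = dirty[i].end + 1;
	}
	printf("%llu dirty metadata bytes\n", (unsigned long long)dirty_bytes);
	return 0;
}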
btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, @@ -106,6 +109,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } +static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + return btrfs_extent_root(fs_info, 0); +} + void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc3da7585fb7..d89273c4b6b8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { + struct btrfs_root *root = btrfs_extent_root(fs_info, start); int ret; struct btrfs_key key; struct btrfs_path *path; @@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) key.objectid = start; key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); btrfs_free_path(path); return ret; } @@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags) { + struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; @@ -153,7 +155,8 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + extent_root = btrfs_extent_root(fs_info, bytenr); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -171,7 +174,7 @@ search_again: if (ret == 0) { leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (item_size >= sizeof(*ei)) { ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -593,6 +596,7 @@ fail: } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop, int *last_ref) { @@ -626,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, 
trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) @@ -685,7 +689,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -709,6 +713,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -721,8 +726,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, - path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } @@ -787,7 +791,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -865,7 +869,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; btrfs_print_v0_err(fs_info); @@ -1007,7 +1011,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); @@ -1119,7 +1123,7 @@ void update_inline_extent_backref(struct btrfs_path *path, } else { *last_ref = 1; size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) @@ -1174,6 +1178,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data, int *last_ref) @@ -1185,11 +1190,11 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, update_inline_extent_backref(path, iref, -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, path, refs_to_drop, + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, last_ref); } else { *last_ref = 1; - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + ret = btrfs_del_item(trans, root, path); } return ret; } @@ -1266,7 +1271,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1313,22 +1318,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 
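The loop being reworked in btrfs_discard_extent() here cannot handle [bytenr, bytenr + num_bytes) in one shot: each btrfs_map_block() call may shrink num_bytes to what the current chunk mapping covers, and the loop advances cur by however much was actually mapped. A runnable model, with a fixed chunk size standing in for the real chunk layout (the 64K size is invented):

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE (64 * 1024ULL)

/* Stand-in for the mapper: how much of [cur, cur + want) fits in the
 * chunk containing cur. */
static uint64_t map_len(uint64_t cur, uint64_t want)
{
	uint64_t avail = (cur / CHUNK_SIZE + 1) * CHUNK_SIZE - cur;

	return want < avail ? want : avail;
}

int main(void)
{
	uint64_t cur = 60 * 1024;
	uint64_t end = 200 * 1024;

	while (cur < end) {
		uint64_t num_bytes = map_len(cur, end - cur);

		printf("discard [%llu, %llu)\n", (unsigned long long)cur,
		       (unsigned long long)(cur + num_bytes));
		cur += num_bytes;
	}
	return 0;
}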
bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_bio *bbio = NULL; - + struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; int i; num_bytes = end - cur; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bbio, 0); + &num_bytes, &bioc, 0); /* * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or * -EOPNOTSUPP. For any such error, @num_bytes is not updated, @@ -1337,8 +1341,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, if (ret < 0) goto out; - stripe = bbio->stripes; - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + stripe = bioc->stripes; + for (i = 0; i < bioc->num_stripes; i++, stripe++) { u64 bytes; struct btrfs_device *device = stripe->dev; @@ -1361,7 +1365,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * And since there are two loops, explicitly * go to out to avoid confusion. */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } @@ -1372,7 +1376,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, */ ret = 0; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); cur += num_bytes; } out: @@ -1397,7 +1401,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1573,6 +1577,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -1602,8 +1607,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.offset = head->num_bytes; } + root = btrfs_extent_root(fs_info, key.objectid); again: - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { err = ret; goto out; @@ -1635,7 +1641,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; @@ -1845,8 +1851,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) { btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info->csum_root, - head->bytenr, head->num_bytes); + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(fs_info, head->bytenr); + ret = btrfs_del_csums(trans, csum_root, head->bytenr, + head->num_bytes); } } @@ -2286,7 +2295,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = 
btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; @@ -2317,7 +2326,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); /* If extent item has more than 1 inline ref then it's shared */ @@ -2376,7 +2385,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, out: btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; } @@ -2438,10 +2447,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset); - generic_ref.skip_qgroup = for_reloc; + key.offset, root->root_key.objectid, + for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2453,9 +2461,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = fs_info->nodesize; btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); - generic_ref.skip_qgroup = for_reloc; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, + root->root_key.objectid, for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2923,7 +2930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -2939,6 +2946,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + extent_root = btrfs_extent_root(info, bytenr); + ASSERT(extent_root); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2999,9 +3009,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto err_dump; } /* Must be SHARED_* item, remove the backref first */ - ret = remove_extent_backref(trans, path, NULL, - refs_to_drop, - is_data, &last_ref); + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, is_data, + &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3071,7 +3081,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); + item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EINVAL; btrfs_print_v0_err(info); @@ -3125,8 +3135,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, path, iref, - refs_to_drop, is_data, + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3182,7 +3192,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 
btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info->csum_root, bytenr, + struct btrfs_root *csum_root; + csum_root = btrfs_csum_root(info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3196,7 +3208,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3278,20 +3290,20 @@ out_delayed_unlock: } void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); + root_id, 0, false); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -3301,7 +3313,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; bool must_pin = false; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) { btrfs_redirty_list_add(trans->transaction, buf); @@ -3373,9 +3385,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. 
*/ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; @@ -3386,9 +3398,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -3476,7 +3488,9 @@ enum btrfs_extent_allocation_policy { */ struct find_free_extent_ctl { /* Basic allocation info */ + u64 ram_bytes; u64 num_bytes; + u64 min_alloc_size; u64 empty_size; u64 flags; int delalloc; @@ -3495,6 +3509,9 @@ struct find_free_extent_ctl { /* Allocation is called for tree-log */ bool for_treelog; + /* Allocation is called for data relocation */ + bool for_data_reloc; + /* RAID index, converted from flags */ int index; @@ -3756,8 +3773,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; + u64 data_reloc_bytenr; int ret = 0; - bool skip; + bool skip = false; ASSERT(btrfs_is_zoned(block_group->fs_info)); @@ -3767,19 +3785,61 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, */ spin_lock(&fs_info->treelog_bg_lock); log_bytenr = fs_info->treelog_bg; - skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || - (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr))) + skip = true; spin_unlock(&fs_info->treelog_bg_lock); if (skip) return 1; + /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. + */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || + block_group->alloc_offset == block_group->zone_capacity) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } + spin_unlock(&block_group->lock); + + if (!ret && !btrfs_zone_activate(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. 
+ */ + } + spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); + + if (ret) + goto out; ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0); if (block_group->ro) { ret = 1; @@ -3796,7 +3856,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; } - avail = block_group->length - block_group->alloc_offset; + /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. + */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); + avail = block_group->zone_capacity - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { /* @@ -3813,6 +3884,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3829,6 +3903,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); @@ -3897,6 +3974,28 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } +static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return true; + case BTRFS_EXTENT_ALLOC_ZONED: + /* + * If we have enough free space left in an already + * active block group and we can't activate any other + * zone now, do not allow allocating a new chunk and + * let find_free_extent() retry with a smaller size. 
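What the zoned branch above implements is bump-pointer allocation bounded by zone_capacity, which can be smaller than the block group's length, hence the new WARN_ON_ONCE and the avail computation. Stripped down to the arithmetic (all sizes invented):

#include <stdint.h>
#include <stdio.h>

struct zoned_bg {
	uint64_t length;
	uint64_t zone_capacity;	/* usable bytes, <= length */
	uint64_t alloc_offset;	/* bump pointer */
};

static int zoned_alloc(struct zoned_bg *bg, uint64_t num_bytes, uint64_t *off)
{
	uint64_t avail = bg->zone_capacity - bg->alloc_offset;

	if (avail < num_bytes)
		return 1;	/* caller moves on to another block group */
	*off = bg->alloc_offset;
	bg->alloc_offset += num_bytes;
	return 0;
}

int main(void)
{
	struct zoned_bg bg = {
		.length = 256 << 20,
		.zone_capacity = 192 << 20,
		.alloc_offset = 190 << 20,
	};
	uint64_t off;

	printf("%s\n", zoned_alloc(&bg, 4 << 20, &off) ? "no room" : "ok");
	return 0;
}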
+ */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return false; + return true; + default: + BUG(); + } +} + static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { @@ -3925,25 +4024,25 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, bool full_search) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; - if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && - ffe_ctl->have_caching_bg) - return 1; - - if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) - return 1; - if (ins->objectid) { found_extent(ffe_ctl, ins); return 0; } + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + + ffe_ctl->index++; + if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) + return 1; + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along @@ -3972,6 +4071,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; int exist = 0; + /* Check if the allocation policy allows creating a new chunk */ + if (!can_allocate_chunk(fs_info, ffe_ctl)) + return -ENOSPC; + trans = current->journal_info; if (trans) exist = 1; @@ -4085,6 +4188,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } + if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } return 0; default: BUG(); } @@ -4117,65 +4226,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, - u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte_orig, struct btrfs_key *ins, - u64 flags, int delalloc) + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; - struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - WARN_ON(num_bytes < fs_info->sectorsize); - - ffe_ctl.num_bytes = num_bytes; - ffe_ctl.empty_size = empty_size; - ffe_ctl.flags = flags; - ffe_ctl.search_start = 0; - ffe_ctl.delalloc = delalloc; - ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); - ffe_ctl.have_caching_bg = false; - ffe_ctl.orig_have_caching_bg = false; - ffe_ctl.found_offset = 0; - ffe_ctl.hint_byte = hint_byte_orig; - ffe_ctl.for_treelog = for_treelog; - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); + ffe_ctl->search_start = 0; + /* For clustered allocation */ + ffe_ctl->empty_cluster = 0; + ffe_ctl->last_ptr = NULL; + ffe_ctl->use_cluster = true; + ffe_ctl->have_caching_bg = false; + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; /* For clustered allocation */ - ffe_ctl.retry_clustered = false; -
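can_allocate_chunk() above is the new gate in the chunk-allocation path: under the zoned policy, a fresh chunk is refused when an already-active block group could still serve a smaller allocation and no further zone can be activated, pushing find_free_extent() to retry with a smaller size instead. The decision in isolation (enum and parameter names invented):

#include <stdbool.h>
#include <stdio.h>

enum policy { ALLOC_CLUSTERED, ALLOC_ZONED };

static bool can_allocate_chunk(enum policy p, unsigned long long max_extent,
			       unsigned long long min_alloc,
			       bool can_activate_zone)
{
	switch (p) {
	case ALLOC_CLUSTERED:
		return true;
	case ALLOC_ZONED:
		/* A smaller retry can still fit; don't burn a new zone. */
		if (max_extent >= min_alloc && !can_activate_zone)
			return false;
		return true;
	}
	return false;
}

int main(void)
{
	printf("%d\n", can_allocate_chunk(ALLOC_ZONED, 1 << 20, 4096, false));
	return 0;
}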
ffe_ctl.retry_unclustered = false; - ffe_ctl.last_ptr = NULL; - ffe_ctl.use_cluster = true; + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; if (btrfs_is_zoned(fs_info)) - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, num_bytes, empty_size, flags); + trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, + ffe_ctl->flags); - space_info = btrfs_find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", flags); + btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } - ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); if (ret < 0) return ret; - ffe_ctl.search_start = max(ffe_ctl.search_start, - first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); - if (ffe_ctl.search_start == ffe_ctl.hint_byte) { + ffe_ctl->search_start = max(ffe_ctl->search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); + if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, - ffe_ctl.search_start); + ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -4183,7 +4289,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, flags) && + if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4197,9 +4303,10 @@ static noinline int find_free_extent(struct btrfs_root *root, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - ffe_ctl.index = btrfs_bg_flags_to_raid_index( - block_group->flags); - btrfs_lock_block_group(block_group, delalloc); + ffe_ctl->index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); goto have_block_group; } } else if (block_group) { @@ -4207,31 +4314,33 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - ffe_ctl.have_caching_bg = false; - if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || - ffe_ctl.index == 0) + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, - &space_info->block_groups[ffe_ctl.index], list) { + &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. 
*/ if (unlikely(block_group->ro)) { - if (for_treelog) + if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); continue; } - btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->start; + btrfs_grab_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, flags)) { + if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | @@ -4242,7 +4351,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((flags & extra) && !(block_group->flags & extra)) + if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* @@ -4250,14 +4359,14 @@ search: * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. */ - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: - ffe_ctl.cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl.cached)) { - ffe_ctl.have_caching_bg = true; + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); /* @@ -4280,10 +4389,11 @@ have_block_group: goto loop; bg_ret = NULL; - ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, + ffe_ctl->delalloc); block_group = bg_ret; } } else if (ret == -EAGAIN) { @@ -4293,46 +4403,49 @@ have_block_group: } /* Checks */ - ffe_ctl.search_start = round_up(ffe_ctl.found_offset, - fs_info->stripesize); + ffe_ctl->search_start = round_up(ffe_ctl->found_offset, + fs_info->stripesize); /* move on to the next group */ - if (ffe_ctl.search_start + num_bytes > + if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } - if (ffe_ctl.found_offset < ffe_ctl.search_start) + if (ffe_ctl->found_offset < ffe_ctl->search_start) btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + ffe_ctl->found_offset, + ffe_ctl->search_start - ffe_ctl->found_offset); - ret = btrfs_add_reserved_bytes(block_group, ram_bytes, - num_bytes, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, + ffe_ctl->delalloc); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = ffe_ctl.search_start; - ins->offset = num_bytes; + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, - num_bytes); - btrfs_release_block_group(block_group, delalloc); + trace_btrfs_reserve_extent(block_group, 
ffe_ctl->search_start, + ffe_ctl->num_bytes); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: - release_block_group(block_group, &ffe_ctl, delalloc); + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; @@ -4341,12 +4454,12 @@ loop: * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. */ - if (!ffe_ctl.max_extent_size) - ffe_ctl.max_extent_size = ffe_ctl.total_free_space; + if (!ffe_ctl->max_extent_size) + ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = ffe_ctl.max_extent_size; + space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); - ins->offset = ffe_ctl.max_extent_size; + ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } @@ -4404,16 +4517,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; + struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, - hint_byte, ins, flags, delalloc); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.min_alloc_size = min_alloc_size; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.delalloc = delalloc; + ffe_ctl.hint_byte = hint_byte; + ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; + + ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { @@ -4431,8 +4556,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu tree-log %d", - flags, num_bytes, for_treelog); + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4486,6 +4611,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins, int ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; @@ -4505,8 +4631,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + extent_root = btrfs_extent_root(fs_info, ins->objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; @@ -4543,7 +4669,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); if (ret) { /* -ENOENT, logic error */ 
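/*
 * [Editor's example] The shape of the refactor above, as a standalone C
 * sketch with hypothetical names: a long positional argument list becomes a
 * single control struct filled with designated initializers, so a new input
 * such as for_data_reloc can be threaded through without touching every call
 * site of the search function.
 */
#include <stdbool.h>
#include <stdint.h>

struct alloc_ctl {			/* analog of find_free_extent_ctl */
	uint64_t ram_bytes;
	uint64_t num_bytes;
	uint64_t min_alloc_size;
	uint64_t hint_byte;
	uint64_t flags;
	int delalloc;
	bool for_treelog;
	bool for_data_reloc;
};

static int find_extent(const struct alloc_ctl *ctl)
{
	/* ... search driven entirely by ctl->... fields ... */
	return ctl->num_bytes ? 0 : -1;
}

static int reserve_extent(uint64_t num_bytes, uint64_t flags, bool for_treelog)
{
	struct alloc_ctl ctl = {
		.ram_bytes = num_bytes,
		.num_bytes = num_bytes,
		.flags = flags,
		.for_treelog = for_treelog,
	};

	return find_extent(&ctl);
}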
btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -4558,6 +4684,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; @@ -4589,8 +4716,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - &extent_key, size); + extent_root = btrfs_extent_root(fs_info, extent_key.objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, + size); if (ret) { btrfs_free_path(path); return ret; @@ -4632,7 +4760,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); + fs_info->nodesize, true); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", extent_key.objectid, extent_key.offset); @@ -4655,7 +4783,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -4847,8 +4976,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins.objectid, ins.offset, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_init_tree_ref(&generic_ref, level, root_objectid, + root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) @@ -4859,6 +4988,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); @@ -5264,7 +5394,8 @@ skip: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; @@ -5385,7 +5516,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, goto owner_mismatch; } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -5749,13 +5881,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; } - btrfs_assert_tree_locked(parent); + btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - btrfs_assert_tree_locked(node); + btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); 
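/*
 * [Editor's example] alloc_reserved_file_extent() and
 * alloc_reserved_tree_block() above stop using the single global
 * fs_info->extent_root and instead ask btrfs_extent_root() for a root keyed
 * on the extent's bytenr. A hypothetical sketch of such a lookup, assuming a
 * fixed shard count and region-sized routing (illustration only, not the
 * actual btrfs_extent_root() implementation):
 */
#include <stdint.h>

#define NR_GLOBAL_ROOTS 4	/* assumed shard count */

struct tree_root;		/* opaque per-shard root */

static struct tree_root *global_extent_roots[NR_GLOBAL_ROOTS];

static struct tree_root *extent_root_for(uint64_t bytenr, uint64_t region_size)
{
	/* All extents within one region map to the same shard. */
	return global_extent_roots[(bytenr / region_size) % NR_GLOBAL_ROOTS];
}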
path->nodes[level] = node; path->slots[level] = 0; @@ -5964,6 +6096,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int dev_ret = 0; int ret = 0; + if (range->start == U64_MAX) + return -EINVAL; + /* * Check range overflow if range->len is set. * The default range->len is U64_MAX. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aaddd7225348..d6d48ecf823c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -241,7 +241,7 @@ int __init extent_io_init(void) return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), + offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) goto free_buffer_cache; @@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* * Find and lock a contiguous range of bytes in the file marked as delalloc, no - * more than @max_bytes. @Start and @end are used to return the range, + * more than @max_bytes. * - * Return: true if we find something - * false if nothing was in the tree + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, @@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, u64 *end) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, int ret; int loops = 0; + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { *start = delalloc_start; - *end = delalloc_end; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); free_extent_state(cached_state); return false; } @@ -2282,29 +2300,29 @@ int free_io_failure(struct extent_io_tree *failure_tree, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. 
*/ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); - if (btrfs_is_zoned(fs_info)) - return btrfs_repair_one_zone(fs_info, logical); + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ @@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, * stripe's dev and sector. */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bbio, 0); + &map_length, &bioc, 0); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - ASSERT(bbio->mirror_num == 1); + ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); + &map_length, &bioc, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - BUG_ON(mirror_num != bbio->mirror_num); + BUG_ON(mirror_num != bioc->mirror_num); } - sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[bbio->mirror_num - 1].dev; - btrfs_put_bbio(bbio); + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); @@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; - struct btrfs_io_bio *repair_io_bio; + struct btrfs_bio *repair_bbio; blk_status_t status; btrfs_debug(fs_info, @@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode, return -EIO; } - repair_bio = btrfs_io_bio_alloc(1); - repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio = btrfs_bio_alloc(1); + repair_bbio = btrfs_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; - if (failed_io_bio->csum) { + if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; - repair_io_bio->csum = repair_io_bio->csum_inline; - memcpy(repair_io_bio->csum, - failed_io_bio->csum + csum_size * icsum, csum_size); + repair_bbio->csum = repair_bbio->csum_inline; + memcpy(repair_bbio->csum, + failed_bbio->csum + csum_size * icsum, csum_size); } bio_add_page(repair_bio, page, failrec->len, pgoff); 
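/*
 * [Editor's example] The repair path above targets exactly one mirror: it
 * maps the logical range, picks stripes[mirror_num - 1], and converts the
 * physical byte offset into 512-byte sector units (the ">> 9") for
 * bio->bi_iter.bi_sector before submitting a single-vector write. A
 * userspace analog of the rewrite step (hypothetical helper names):
 */
#include <stdint.h>
#include <unistd.h>

/* bio sectors are always 512 bytes, independent of the fs sectorsize */
static inline uint64_t byte_to_sector(uint64_t physical)
{
	return physical >> 9;
}

/* Rewrite @len bytes of known-good data at @physical on device @fd. */
static ssize_t repair_one_copy(int fd, uint64_t physical, const void *good,
			       size_t len)
{
	return pwrite(fd, good, len, (off_t)physical);
}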
- repair_io_bio->logical = failrec->start; - repair_io_bio->iter = repair_bio->bi_iter; + repair_bbio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), "repair read error: submitting new read to mirror %d", @@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* @@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, - io_bio->mirror_num); + bbio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = io_bio->mirror_num; + mirror = bbio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(io_bio, + error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); ret = error_bitmap; } else { - ret = btrfs_validate_metadata_buffer(io_bio, + ret = btrfs_validate_metadata_buffer(bbio, page, start, end, mirror); } if (ret) @@ -3070,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = mirror; atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, - &eb->bflags)) - btree_readahead_hook(eb, -EIO); } readpage_ok: if (likely(uptodate)) { @@ -3106,7 +3120,7 @@ readpage_ok: } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_io_bio_free_csum(io_bio); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -3115,53 +3129,43 @@ readpage_ok: * new bio by bio_alloc_bioset as it does not initialize the bytes outside of * 'bio' because use of __GFP_ZERO is not supported. */ -static inline void btrfs_bio_init(struct btrfs_io_bio *btrfs_bio) +static inline void btrfs_bio_init(struct btrfs_bio *bbio) { - memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); } /* - * The following helpers allocate a bio. As it's backed by a bioset, it'll - * never fail. We're returning a bio right now but you can call btrfs_io_bio - * for the appropriate container_of magic + * Allocate a btrfs_bio, with @nr_iovecs as maximum number of iovecs. + * + * The bio allocation is backed by bioset and does not fail.
*/ -struct bio *btrfs_bio_alloc(u64 first_byte) +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); - bio->bi_iter.bi_sector = first_byte >> 9; - btrfs_io_bio_init(btrfs_io_bio(bio)); + ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio)); return bio; } struct bio *btrfs_bio_clone(struct bio *bio) { - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - btrfs_bio = btrfs_io_bio(new); - btrfs_io_bio_init(btrfs_bio); - btrfs_bio->iter = bio->bi_iter; + bbio = btrfs_bio(new); + btrfs_bio_init(bbio); + bbio->iter = bio->bi_iter; return new; } -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_io_bio_init(btrfs_io_bio(bio)); - return bio; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; ASSERT(offset <= UINT_MAX && size <= UINT_MAX); @@ -3169,24 +3173,23 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); - btrfs_bio = btrfs_io_bio(bio); - btrfs_io_bio_init(btrfs_bio); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio); bio_trim(bio, offset >> 9, size >> 9); - btrfs_bio->iter = bio->bi_iter; + bbio->iter = bio->bi_iter; return bio; } /** * Attempt to add a page to bio * - * @bio: destination bio + * @bio_ctrl: record both the bio, and its bio_flags * @page: page to add to the bio * @disk_bytenr: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @pg_offset: starting offset in the page * @size: portion of page that we want to write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @pg_offset: starting offset in the page * @bio_flags: flags of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. @@ -3276,8 +3279,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, else bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - if (!btrfs_is_zoned(fs_info) || - bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; } @@ -3307,14 +3309,15 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; + bio = btrfs_bio_alloc(BIO_MAX_VECS); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. 
*/ if (bio_flags & EXTENT_BIO_COMPRESSED) - bio = btrfs_bio_alloc(disk_bytenr); + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else - bio = btrfs_bio_alloc(disk_bytenr + offset); + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->bio_flags = bio_flags; bio->bi_end_io = end_io_func; @@ -3327,11 +3330,11 @@ static int alloc_new_bio(struct btrfs_inode *inode, if (wbc) { struct block_device *bdev; - bdev = fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_dev->bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { struct btrfs_device *device; device = btrfs_zoned_get_device(fs_info, disk_bytenr, @@ -3341,7 +3344,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, goto error; } - btrfs_io_bio(bio)->device = device; + btrfs_bio(bio)->device = device; } return 0; error: @@ -3599,6 +3602,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; + ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { struct extent_state *cached = NULL; @@ -3776,18 +3780,20 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc, - u64 delalloc_start, unsigned long *nr_written) + struct page *page, struct writeback_control *wbc) { - u64 page_end = delalloc_start + PAGE_SIZE - 1; - bool found; + const u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; - u64 delalloc_end = 0; + /* How many pages are started by btrfs_run_delalloc_range() */ + unsigned long nr_written = 0; int ret; int page_started = 0; + while (delalloc_start < page_end) { + u64 delalloc_end = page_end; + bool found; - while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); @@ -3796,7 +3802,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, continue; } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, nr_written, wbc); + delalloc_end, &page_started, &nr_written, wbc); if (ret) { btrfs_page_set_error(inode->root->fs_info, page, page_offset(page), PAGE_SIZE); @@ -3819,16 +3825,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* did the fill delalloc function already unlock and start * the IO? */ + /* Did btrfs_run_delalloc_range() already unlock and start the IO? */ if (page_started) { /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. + * We've unlocked the page, so we can't update the mapping's + * writeback index, just update nr_to_write.
*/ - wbc->nr_to_write -= *nr_written; + wbc->nr_to_write -= nr_written; return 1; } @@ -3854,12 +3857,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ - unsigned long dirty_bitmap; unsigned long flags; - int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; - int range_start_bit = nbits; + int range_start_bit; int range_end_bit; /* @@ -3872,13 +3874,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, return; } + range_start_bit = spi->dirty_offset + + (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); - dirty_bitmap = subpage->dirty_bitmap; + bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, + spi->dirty_offset + spi->bitmap_nr_bits); spin_unlock_irqrestore(&subpage->lock, flags); - bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, - BTRFS_SUBPAGE_BITMAP_SIZE); + range_start_bit -= spi->dirty_offset; + range_end_bit -= spi->dirty_offset; + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; *end = page_offset(page) + range_end_bit * fs_info->sectorsize; } @@ -3896,7 +3903,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct writeback_control *wbc, struct extent_page_data *epd, loff_t i_size, - unsigned long nr_written, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -3915,7 +3921,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); unlock_page(page); return 1; } @@ -3924,7 +3929,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, nr_written + 1); + update_nr_written(wbc, 1); while (cur <= end) { u64 disk_bytenr; @@ -4054,14 +4059,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned long nr_written = 0; trace___extent_writepage(page, inode, wbc); @@ -4090,8 +4095,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, - &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; if (ret) @@ -4099,7 +4103,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, - nr_written, &nr); + &nr); if (ret == 1) return 0; @@ -4141,8 +4145,20 @@ done: * capable of that. 
*/ if (PageError(page)) - end_extent_writepage(page, ret, start, page_end); - unlock_page(page); + end_extent_writepage(page, ret, page_start, page_end); + if (epd->extent_locked) { + /* + * If epd->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or + * process_one_page(). + * Let btrfs_page_unlock_writer() handle both cases. + */ + ASSERT(wbc); + btrfs_page_unlock_writer(fs_info, page, wbc->range_start, + wbc->range_end + 1 - wbc->range_start); + } else { + unlock_page(page); + } ASSERT(ret <= 0); return ret; } @@ -4155,6 +4171,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { + if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) + btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4285,6 +4304,20 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) return; /* + * A read may stumble upon this buffer later, make sure that it gets an + * error and knows there was an error. + */ + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + /* + * We need to set the mapping with the io error as well because a write + * error will flip the file system readonly, and then syncfs() will + * return a 0 because we are readonly if we don't modify the err seq for + * the superblock. + */ + mapping_set_error(page->mapping, -EIO); + + /* * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. */ @@ -4602,12 +4635,11 @@ static int submit_eb_subpage(struct page *page, int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; - const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; int ret; /* Lock and write each dirty extent buffer in the range */ - while (bit_start < nbits) { + while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; struct extent_buffer *eb; unsigned long flags; @@ -4623,7 +4655,8 @@ static int submit_eb_subpage(struct page *page, break; } spin_lock_irqsave(&subpage->lock, flags); - if (!((1 << bit_start) & subpage->dirty_bitmap)) { + if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->private_lock); bit_start++; @@ -4756,8 +4789,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, free_extent_buffer(eb); return ret; } - if (cache) + if (cache) { + /* Implies write in zoned mode. Mark the last eb in a block group. */ + if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) + set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); btrfs_put_block_group(cache); + } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4873,7 +4911,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error.
*/ - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (!BTRFS_FS_ERROR(fs_info)) { ret = flush_write_bio(&epd); } else { ret = -EROFS; @@ -5069,23 +5107,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode) +/* + * Submit the pages in the range to bio for call sites whose delalloc range + * has already been run (aka, ordered extent inserted) and all pages are still + * locked. + */ +int extent_write_locked_range(struct inode *inode, u64 start, u64 end) { + bool found_error = false; + int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_SIZE) >> - PAGE_SHIFT; - + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; struct extent_page_data epd = { .bio_ctrl = { 0 }, .extent_locked = 1, - .sync_io = mode == WB_SYNC_ALL, + .sync_io = 1, }; struct writeback_control wbc_writepages = { - .sync_mode = mode, - .nr_to_write = nr_pages * 2, + .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, /* We're called from an async helper function */ @@ -5093,33 +5136,49 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .no_cgroup_owner = 1, }; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> + PAGE_SHIFT; + wbc_writepages.nr_to_write = nr_pages * 2; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); - while (start <= end) { - page = find_get_page(mapping, start >> PAGE_SHIFT); - if (clear_page_dirty_for_io(page)) - ret = __extent_writepage(page, &wbc_writepages, &epd); - else { - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, true); - unlock_page(page); + while (cur <= end) { + u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + + page = find_get_page(mapping, cur >> PAGE_SHIFT); + /* + * All pages in the range are locked since + * btrfs_run_delalloc_range(), thus there is no way to clear + * the page dirty flag. + */ + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + ret = __extent_writepage(page, &wbc_writepages, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; + first_error = ret; } put_page(page); - start += PAGE_SIZE; + cur = cur_end + 1; } - ASSERT(ret <= 0); - if (ret == 0) + if (!found_error) ret = flush_write_bio(&epd); else end_write_bio(&epd, ret); wbc_detach_inode(&wbc_writepages); + if (found_error) + return first_error; return ret; } int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct inode *inode = mapping->host; int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5127,7 +5186,13 @@ int extent_writepages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates.
+ */ + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); @@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - ret = btrfs_alloc_subpage(fs_info, &prealloc, - BTRFS_SUBPAGE_METADATA); - if (ret < 0) { - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; + if (fs_info->sectorsize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + ret = PTR_ERR(prealloc); + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } } spin_lock(&mapping->private_lock); @@ -6530,6 +6597,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + /* + * We could have had EXTENT_BUFFER_UPTODATE cleared by the write + * operation, which could potentially still be in flight. In this case + * we simply want to return an error. + */ + if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) + return -EIO; + if (eb->fs_info->sectorsize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); @@ -7167,32 +7242,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; u64 page_start = page_offset(page); - int ret; - int i; + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); - ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); lockdep_assert_held(&fs_info->buffer_lock); - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, - bytenr >> fs_info->sectorsize_bits, - PAGE_SIZE / fs_info->nodesize); - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - break; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - break; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } +out: return found; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 53abdc280451..0399cf8e3c32 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); -int extent_write_locked_range(struct inode *inode, u64 
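/*
 * [Editor's example] get_next_extent_buffer() above now walks the radix tree
 * in batches of GANG_LOOKUP_SIZE, restarting each batch right after the last
 * buffer seen, instead of assuming one bounded lookup can cover the whole
 * page. The batching skeleton, with the tree lookup abstracted behind an
 * assumed helper:
 */
#include <stddef.h>
#include <stdint.h>

#define GANG_LOOKUP_SIZE 16

struct eb { uint64_t start, len; };

/* Assumed helper: fill @gang with up to @nr entries at or after @index. */
extern int gang_lookup(struct eb **gang, uint64_t index, unsigned int nr);

static struct eb *next_eb_in_page(uint64_t bytenr, uint64_t page_start,
				  uint64_t page_size)
{
	struct eb *gang[GANG_LOOKUP_SIZE];
	uint64_t cur = page_start;

	while (cur < page_start + page_size) {
		int ret = gang_lookup(gang, cur, GANG_LOOKUP_SIZE);

		if (ret == 0)
			return NULL;
		for (int i = 0; i < ret; i++) {
			if (gang[i]->start >= page_start + page_size)
				return NULL;	/* already beyond page end */
			if (gang[i]->start >= bytenr)
				return gang[i];	/* found one */
		}
		/* Restart the lookup right after this batch. */
		cur = gang[ret - 1]->start + gang[ret - 1]->len;
	}
	return NULL;
}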
start, u64 end, - int mode); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(u64 first_byte); -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a8e02f7b6c7..5a36add21305 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; set_extent_bits_nowait(&device->alloc_state, stripe->physical, @@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 0b9401a5afd3..90c5c38836ab 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; - csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item = btrfs_item_size(leaf, path->slots[0]); csums_in_item /= csum_size; if (csum_offset == csums_in_item) { @@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 disk_bytenr, u64 len, u8 *dst) { + struct btrfs_root *csum_root; struct btrfs_csum_item *item = NULL; struct btrfs_key key; const u32 sectorsize = fs_info->sectorsize; @@ -274,7 +275,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -285,13 +286,14 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, /* Current item doesn't contain the desired range, search again */ btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { ret = PTR_ERR(item); goto out; } 
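/*
 * [Editor's example] The checksum-item arithmetic that the btrfs_item_size()
 * conversions above keep exercising: one csum item stores
 * item_size / csum_size checksums, each covering one sector, and a disk
 * bytenr indexes into the item by sector. Plain C version:
 */
#include <stdint.h>

static uint32_t csums_in_item(uint32_t item_size, uint32_t csum_size)
{
	return item_size / csum_size;
}

/* Analog of: (bytenr - found_key.offset) >> fs_info->sectorsize_bits */
static uint64_t csum_index(uint64_t bytenr, uint64_t item_start,
			   uint32_t sectorsize_bits)
{
	return (bytenr - item_start) >> sectorsize_bits;
}

/* Analog of: csum_len = (itemsize / csum_size) * sectorsize */
static uint64_t csum_item_coverage(uint32_t item_size, uint32_t csum_size,
				   uint32_t sectorsize)
{
	return (uint64_t)(item_size / csum_size) * sectorsize;
}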
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; @@ -358,7 +360,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If * NULL, the checksum buffer is allocated and returned in - * btrfs_io_bio(bio)->csum instead. + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -376,7 +378,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; - if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* @@ -397,19 +400,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum = kmalloc_array(nblocks, csum_size, - GFP_NOFS); - if (!btrfs_bio->csum) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } } else { - btrfs_bio->csum = btrfs_bio->csum_inline; + bbio->csum = bbio->csum_inline; } - csum = btrfs_bio->csum; + csum = bbio->csum; } else { csum = dst; } @@ -535,7 +537,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, key.type == BTRFS_EXTENT_CSUM_KEY) { offset = (start - key.offset) >> fs_info->sectorsize_bits; if (offset * csum_size < - btrfs_item_size_nr(leaf, path->slots[0] - 1)) + btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } @@ -560,7 +562,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size_nr(leaf, path->slots[0]); + size = btrfs_item_size(leaf, path->slots[0]); csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; if (csum_end <= start) { path->slots[0]++; @@ -709,12 +711,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, index = 0; } - data = kmap_atomic(bvec.bv_page); - crypto_shash_digest(shash, data + bvec.bv_offset - + (i * fs_info->sectorsize), + data = bvec_kmap_local(&bvec); + crypto_shash_digest(shash, + data + (i * fs_info->sectorsize), fs_info->sectorsize, sums->sums + index); - kunmap_atomic(data); + kunmap_local(data); index += fs_info->csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; @@ -751,7 +753,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key->offset; @@ -802,7 +804,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root == fs_info->csum_root || + ASSERT(root->root_key.objectid == 
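/*
 * [Editor's example] btrfs_lookup_bio_sums() above keeps a small inline
 * checksum buffer in the bio wrapper and only allocates from the heap when
 * nblocks * csum_size outgrows it. The same small-buffer pattern as
 * standalone C (the inline capacity is an assumed stand-in for
 * BTRFS_BIO_INLINE_CSUM_SIZE):
 */
#include <stdint.h>
#include <stdlib.h>

#define INLINE_CSUM_SIZE 64

struct csum_buf {
	uint8_t *csum;				/* inline or heap */
	uint8_t csum_inline[INLINE_CSUM_SIZE];
};

static int csum_buf_setup(struct csum_buf *b, uint32_t nblocks,
			  uint32_t csum_size)
{
	if ((uint64_t)nblocks * csum_size > INLINE_CSUM_SIZE) {
		b->csum = calloc(nblocks, csum_size);
		if (!b->csum)
			return -1;		/* -ENOMEM in the kernel */
	} else {
		b->csum = b->csum_inline;
	}
	return 0;
}

static void csum_buf_release(struct csum_buf *b)
{
	if (b->csum != b->csum_inline)
		free(b->csum);
	b->csum = NULL;
}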
BTRFS_CSUM_TREE_OBJECTID || root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); @@ -835,7 +837,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (key.offset >= end_byte) break; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key.offset; @@ -1003,7 +1005,7 @@ again: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -1014,7 +1016,7 @@ again: u32 item_size; /* we found one, but it isn't big enough yet */ leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if ((item_size / csum_size) >= MAX_CSUM_ITEMS(fs_info, csum_size)) { /* already at max size, make a new one */ @@ -1071,7 +1073,7 @@ again: } extend_csum: - if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / csum_size) { int extend_nr; u64 tmp; @@ -1126,7 +1128,7 @@ extend_csum: diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); - diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + diff = diff - btrfs_item_size(leaf, path->slots[0]); diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); diff /= csum_size; diff *= csum_size; @@ -1163,7 +1165,7 @@ insert: csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item + - btrfs_item_size_nr(leaf, path->slots[0])); + btrfs_item_size(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..11204dbbe053 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * unlocks pages after btrfs_file_write is done with them */ -static void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, + struct page **pages, size_t num_pages, + u64 pos, u64 copied) { size_t i; + u64 block_start = round_down(pos, fs_info->sectorsize); + u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; + + ASSERT(block_len <= U32_MAX); for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty * clearing. In theory, data should have been accessed * in prepare_pages via find_or_create_page() */ - ClearPageChecked(pages[i]); + btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, + block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct page *p = pages[i]; btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - ClearPageChecked(p); + btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } @@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs =
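/*
 * [Editor's example] The clamping math btrfs_drop_pages() gains above: the
 * checked bit is cleared only for the sector-aligned blocks the write
 * actually touched, which matters on subpage (sectorsize < PAGE_SIZE)
 * filesystems. The round_down/round_up arithmetic in plain C (power-of-two
 * alignment assumed, as for sector sizes):
 */
#include <stdint.h>

static inline uint64_t round_down_p2(uint64_t x, uint64_t align)
{
	return x & ~(align - 1);
}

static inline uint64_t round_up_p2(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

static void written_block_range(uint64_t pos, uint64_t copied,
				uint32_t sectorsize,
				uint64_t *block_start, uint64_t *block_len)
{
	*block_start = round_down_p2(pos, sectorsize);
	*block_len = round_up_p2(pos + copied, sectorsize) - *block_start;
}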
(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -870,7 +876,8 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - args->start - extent_offset); + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } @@ -956,7 +963,8 @@ delete_extent_item: btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, - key.offset - extent_offset); + key.offset - extent_offset, 0, + false); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ args->bytes_found += extent_end - key.offset; @@ -1021,8 +1029,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &args->extent_item_size, 1); + btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } @@ -1233,7 +1240,7 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset); + orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1258,7 +1265,8 @@ again: other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, + 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1710,7 +1718,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { ret = -EFAULT; break; } @@ -1845,7 +1853,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); break; } @@ -1853,7 +1861,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); cond_resched(); @@ -1904,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { + const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); loff_t pos; ssize_t written = 0; ssize_t written_buffered; + size_t prev_left = 0; loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; - struct iomap_dio *dio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1956,23 +1965,80 @@ relock: goto buffered; } - dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0); + /* + * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() + * calls generic_write_sync() (through iomap_dio_complete()), because + * that results in calling fsync (btrfs_sync_file()) which will try to + * lock the inode in exclusive/write mode. 
+ */ + if (is_sync_write) + iocb->ki_flags &= ~IOCB_DSYNC; - btrfs_inode_unlock(inode, ilock_flags); + /* + * The iov_iter can be mapped to the same file range we are writing to. + * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ +again: + from->nofault = true; + err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, written); + from->nofault = false; - if (IS_ERR_OR_NULL(dio)) { - err = PTR_ERR_OR_ZERO(dio); - if (err < 0 && err != -ENOTBLK) - goto out; - } else { - written = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ + if (err > 0) + written = err; + + if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. + * + * Also, in case the iov refers to pages in the file range of the + * file we want to write to (due to a mmap), we could enter an + * infinite loop if we retry after faulting the pages in, since + * iomap will invalidate any pages in the range early on, before + * it tries to fault in the pages of the iov. So we keep track of + * how much was left of iov in the previous EFAULT and fallback + * to buffered IO in case we haven't made any progress. + */ + if (left == prev_left) { + err = -ENOTBLK; + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; + goto again; + } } - if (written < 0 || !iov_iter_count(from)) { - err = written; + btrfs_inode_unlock(inode, ilock_flags); + + /* + * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do + * the fsync (call generic_write_sync()). + */ + if (is_sync_write) + iocb->ki_flags |= IOCB_DSYNC; + + /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ + if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; - } buffered: pos = iocb->ki_pos; @@ -1997,7 +2063,7 @@ buffered: invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); out: - return written ? written : err; + return err < 0 ? err : written; } static ssize_t btrfs_file_write_iter(struct kiocb *iocb, @@ -2013,7 +2079,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. 
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) + if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2621,7 +2687,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset); + btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2704,14 +2770,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * cloning we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } @@ -3649,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + size_t prev_left = 0; + ssize_t read = 0; ssize_t ret; if (fsverity_active(inode)) @@ -3658,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); +again: + /* + * This is similar to what we do for direct IO writes, see the comment + * at btrfs_direct_write(), but we also disable page faults in addition + * to disabling them only at the iov_iter level. This is because when + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), + * which can still trigger page fault ins despite having set ->nofault + * to true of our 'to' iov_iter. + * + * The difference to direct IO writes is that we deadlock when trying + * to lock the extent range in the inode's tree during the page reads + * triggered by the fault in (while for writes it is due to waiting for + * our own ordered extent). This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying.
Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - return ret; + return ret < 0 ? ret : read; } static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da0eee7c9e5f..01a408db5683 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,6 +22,8 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "subpage.h" +#include "inode-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -36,7 +38,7 @@ struct btrfs_trim_range { static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); + struct btrfs_free_space *info, bool update_stat); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); @@ -44,7 +46,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info); static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes); + u64 bytes, bool update_stats); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -287,9 +289,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct inode *inode) + struct inode *vfs_inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_truncate_control control = { + .inode = BTRFS_I(vfs_inode), + .new_size = 0, + .ino = btrfs_ino(BTRFS_I(vfs_inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; @@ -319,19 +330,26 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_free_path(path); } - btrfs_i_size_write(BTRFS_I(inode), 0); - truncate_pagecache(inode, 0); + btrfs_i_size_write(inode, 0); + truncate_pagecache(vfs_inode, 0); + + lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. 
 */
-	ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-					 0, BTRFS_EXTENT_DATA_KEY, NULL);
+	ret = btrfs_truncate_inode_items(trans, root, &control);
+
+	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
+	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
 	if (ret)
 		goto fail;
 
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, root, inode);
 
 fail:
 	if (locked)
@@ -411,7 +429,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 
 	for (i = 0; i < io_ctl->num_pages; i++) {
 		if (io_ctl->pages[i]) {
-			ClearPageChecked(io_ctl->pages[i]);
+			btrfs_page_clear_checked(io_ctl->fs_info,
+					io_ctl->pages[i],
+					page_offset(io_ctl->pages[i]),
+					PAGE_SIZE);
 			unlock_page(io_ctl->pages[i]);
 			put_page(io_ctl->pages[i]);
 		}
@@ -662,7 +683,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 
 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 {
-	struct btrfs_block_group *block_group = ctl->private;
+	struct btrfs_block_group *block_group = ctl->block_group;
 	u64 max_bytes;
 	u64 bitmap_bytes;
 	u64 extent_bytes;
@@ -868,7 +889,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
 	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (!info->bitmap) {
-			unlink_free_space(ctl, info);
+			unlink_free_space(ctl, info, true);
 			ret = btrfs_add_free_space(block_group, info->offset,
 						   info->bytes);
 			kmem_cache_free(btrfs_free_space_cachep, info);
@@ -882,7 +903,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
 							bytes);
 			if (ret)
 				break;
-			bitmap_clear_bits(ctl, info, offset, bytes);
+			bitmap_clear_bits(ctl, info, offset, bytes, true);
 			offset = info->offset;
 			bytes = ctl->unit;
 		}
@@ -1577,6 +1598,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
 }
 
 /*
+ * This is a little subtle.  We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0.  In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries whose ->bytes is
+ * less than the required bytes.  So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have.  For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
+ * call into find_free_space(), and return with max_extent_size == 4K, because
+ * that first bitmap entry had ->max_extent_size set, but the second one did
+ * not.  If instead we returned 8K we'd come in searching for 8K, and find the
+ * 8K contiguous range.
+ *
+ * Consider the other case: we have 2 8K chunks in that second entry and still
+ * don't have ->max_extent_size set.  We'll return 16K, and the next time the
+ * allocator comes in it'll fully search our second bitmap, and this time it'll
+ * get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
+ * right allocation the next loop through.
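+ *
+ * To make the rule below concrete (sizes are hypothetical):
+ *
+ *   bitmap entry, ->max_extent_size == 4K, ->bytes == 1M  -> report 4K
+ *   bitmap entry, ->max_extent_size == 0,  ->bytes == 8K  -> report 8K
+ *                                                            (never searched)
+ *   extent entry (always contiguous),      ->bytes == 8K  -> report 8K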
+ */ +static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) +{ + if (entry->bitmap && entry->max_extent_size) + return entry->max_extent_size; + return entry->bytes; +} + +/* + * We want the largest entry to be leftmost, so this is inverted from what you'd + * normally expect. + */ +static bool entry_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct btrfs_free_space *entry, *exist; + + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + exist = rb_entry(parent, struct btrfs_free_space, bytes_index); + return get_max_extent_size(exist) < get_max_extent_size(entry); +} + +/* * searches the tree for the given offset. * * fuzzy - If this is set, then we are trying to make an allocation, and we just @@ -1588,15 +1653,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; - struct btrfs_free_space *entry, *prev = NULL; + struct btrfs_free_space *entry = NULL, *prev = NULL; /* find entry that is closest to the 'offset' */ - while (1) { - if (!n) { - entry = NULL; - break; - } - + while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; @@ -1606,6 +1666,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, n = n->rb_right; else break; + + entry = NULL; } if (bitmap_only) { @@ -1682,6 +1744,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, return NULL; while (1) { + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) @@ -1690,33 +1756,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (entry->offset + entry->bytes > offset) break; } - - n = rb_next(&entry->offset_index); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); } return entry; } -static inline void -__unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) +static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) { rb_erase(&info->offset_index, &ctl->free_space_offset); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } -} -static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - __unlink_free_space(ctl, info); - ctl->free_space -= info->bytes; + if (update_stat) + ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, @@ -1730,6 +1788,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; @@ -1740,9 +1800,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, - u64 offset, u64 bytes) +static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + ASSERT(info->bitmap); + + /* + * If our entry is empty it's because we're on a cluster and we don't + * want 
to re-link it into our ctl bytes index. + */ + if (RB_EMPTY_NODE(&info->bytes_index)) + return; + + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); +} + +static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; @@ -1758,6 +1834,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta++; @@ -1769,14 +1847,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } -} -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - __bitmap_clear_bits(ctl, info, offset, bytes); - ctl->free_space -= bytes; + if (update_stat) + ctl->free_space -= bytes; } static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, @@ -1793,9 +1866,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, bitmap_set(info->bitmap, start, count); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; + relink_bitmap_entry(ctl, info); + if (start && test_bit(start - 1, info->bitmap)) extent_delta--; @@ -1863,20 +1943,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; + relink_bitmap_entry(ctl, bitmap_info); return -1; } -static inline u64 get_max_extent_size(struct btrfs_free_space *entry) -{ - if (entry->bitmap) - return entry->max_extent_size; - return entry->bytes; -} - /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align, u64 *max_extent_size) + unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; @@ -1886,16 +1960,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (!ctl->free_space_offset.rb_node) goto out; +again: + if (use_bytes_index) { + node = rb_first_cached(&ctl->free_space_bytes); + } else { + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), + 0, 1); + if (!entry) + goto out; + node = &entry->offset_index; + } - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - goto out; + for (; node; node = rb_next(node)) { + if (use_bytes_index) + entry = rb_entry(node, struct btrfs_free_space, + bytes_index); + else + entry = rb_entry(node, struct btrfs_free_space, + offset_index); - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); + /* + * If we are using the bytes index then all subsequent entries + * in this tree are going to be < bytes, so simply set the max + * extent size and exit the loop. + * + * If we're using the offset index then we need to keep going + * through the rest of the tree. 
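		 *
		 * As a sketch with hypothetical sizes: walking the bytes index
		 * over entries of 64K, 16K and 8K while searching for 32K, if
		 * the 64K entry doesn't pan out (e.g. bad alignment), then at
		 * the 16K entry we already know no later entry can be bigger,
		 * so we record 16K as max_extent_size and stop rather than
		 * visiting the 8K entry as an offset-indexed walk would.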
+ */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); + if (use_bytes_index) + break; continue; } @@ -1912,6 +2008,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } + /* + * We don't break here if we're using the bytes index because we + * may have another entry that has the correct alignment that is + * the right size, so we don't want to miss that possibility. + * At worst this adds another loop through the logic, but if we + * broke here we could prematurely ENOSPC. + */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); @@ -1919,6 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bitmap) { + struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); @@ -1931,6 +2035,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, max(get_max_extent_size(entry), *max_extent_size); } + + /* + * The bitmap may have gotten re-arranged in the space + * index here because the max_extent_size may have been + * updated. Start from the beginning again if this + * happened. + */ + if (use_bytes_index && old_next != rb_next(node)) + goto again; continue; } @@ -1969,7 +2082,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } - unlink_free_space(ctl, bitmap_info); + unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; @@ -2007,7 +2120,7 @@ again: /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); - bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; @@ -2079,12 +2192,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); - /* - * We set some bytes, we have no idea what the max extent size is - * anymore. - */ - info->max_extent_size = 0; - return bytes_to_set; } @@ -2092,7 +2199,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2161,7 +2268,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, return 0; if (ctl->op == &free_space_op) - block_group = ctl->private; + block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we @@ -2306,10 +2413,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, /* See try_merge_free_space() comment. 
*/ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { - if (update_stat) - unlink_free_space(ctl, right_info); - else - __unlink_free_space(ctl, right_info); + unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; @@ -2319,10 +2423,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { - if (update_stat) - unlink_free_space(ctl, left_info); - else - __unlink_free_space(ctl, left_info); + unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); @@ -2358,10 +2459,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, end, bytes); - else - __bitmap_clear_bits(ctl, bitmap, end, bytes); + bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2415,10 +2513,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; - if (update_stat) - bitmap_clear_bits(ctl, bitmap, info->offset, bytes); - else - __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); @@ -2462,12 +2557,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { - struct btrfs_block_group *block_group = ctl->private; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; @@ -2482,6 +2577,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); + RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); @@ -2539,10 +2635,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + bool initial = (size == block_group->length); + u64 reclaimable_unusable; + + WARN_ON(!initial && offset + size > block_group->zone_capacity); spin_lock(&ctl->tree_lock); if (!used) to_free = size; + else if (initial) + to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) to_free = size; else if (offset + size <= block_group->alloc_offset) @@ -2565,12 +2667,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); } + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); /* All the region is now unusable. 
Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && - block_group->zone_unusable >= - div_factor_fine(block_group->length, bg_reclaim_threshold)) { + reclaimable_unusable >= + div_factor_fine(block_group->zone_capacity, + bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2589,9 +2694,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, @@ -2622,9 +2725,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size, trim_state); + return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2683,7 +2784,7 @@ again: re_search = false; if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); if (offset == info->offset) { u64 to_free = min(bytes, info->bytes); @@ -2719,7 +2820,7 @@ again: } spin_unlock(&ctl->tree_lock); - ret = __btrfs_add_free_space(block_group->fs_info, ctl, + ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); @@ -2754,8 +2855,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, * out the free space after the allocation offset. 
*/ if (btrfs_is_zoned(fs_info)) { - btrfs_info(fs_info, "free space %llu", - block_group->length - block_group->alloc_offset); + btrfs_info(fs_info, "free space %llu active %d", + block_group->zone_capacity - block_group->alloc_offset, + block_group->zone_is_active); return; } @@ -2783,8 +2885,9 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; - ctl->private = block_group; + ctl->block_group = block_group; ctl->op = &free_space_op; + ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); @@ -2850,6 +2953,8 @@ static void __btrfs_return_cluster_to_free_space( } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); + rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, + entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); @@ -2865,7 +2970,7 @@ static void __btrfs_remove_free_space_cache_locked( while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!info->bitmap) { - unlink_free_space(ctl, info); + unlink_free_space(ctl, info, true); kmem_cache_free(btrfs_free_space_cachep, info); } else { free_bitmap(ctl, info); @@ -2879,8 +2984,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->private) - btrfs_discard_update_discardable(ctl->private); + if (ctl->block_group) + btrfs_discard_update_discardable(ctl->block_group); spin_unlock(&ctl->tree_lock); } @@ -2951,18 +3056,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len, max_extent_size); + block_group->full_stripe_len, max_extent_size, + use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { - bitmap_clear_bits(ctl, entry, offset, bytes); + bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); @@ -2970,7 +3077,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; @@ -2992,8 +3099,7 @@ out: spin_unlock(&ctl->tree_lock); if (align_gap_len) - __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len, + __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } @@ -3064,7 +3170,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, } ret = search_start; - __bitmap_clear_bits(ctl, entry, ret, bytes); + bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } @@ -3240,6 +3346,17 @@ again: cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + + /* + * We need to know if we're currently on the normal space index 
when we + * manipulate the bitmap so that we know we need to remove and re-insert + * it into the space_index tree. Clear the bytes_index node here so the + * bitmap manipulation helpers know not to mess with the space_index + * until this bitmap entry is added back into the normal cache. + */ + RB_CLEAR_NODE(&entry->bytes_index); + ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); ASSERT(!ret); /* -EEXIST; Logic error */ @@ -3330,6 +3447,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); + rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; @@ -3521,13 +3639,13 @@ static int do_trimming(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) - __btrfs_add_free_space(fs_info, ctl, reserved_start, + __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); if (start + bytes < reserved_start + reserved_bytes) - __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); - __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); + __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3601,7 +3719,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim @@ -3627,7 +3745,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, goto next; } - unlink_free_space(ctl, entry); + unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3814,7 +3932,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, bytes > (max_discard_size + minlen)) bytes = max_discard_size; - bitmap_clear_bits(ctl, entry, start, bytes); + bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 1f23088d43f9..15591b299895 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -22,6 +22,7 @@ enum btrfs_trim_state { struct btrfs_free_space { struct rb_node offset_index; + struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; @@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; + struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; @@ -54,7 +56,7 @@ struct btrfs_free_space_ctl { s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; - void *private; + struct btrfs_block_group *block_group; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; }; @@ -101,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size, - enum 
btrfs_trim_state trim_state); +int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, + u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a33bca94d133..655aad0f9e1c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -16,6 +16,18 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); +static struct btrfs_root *btrfs_free_space_root( + struct btrfs_block_group *block_group) +{ + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + + return btrfs_global_root(block_group->fs_info, &key); +} + void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; @@ -51,7 +63,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; @@ -85,7 +97,7 @@ struct btrfs_free_space_info *search_free_space_info( struct btrfs_path *path, int cow) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; int ret; @@ -188,7 +200,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -326,7 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -586,7 +598,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size, int remove) { - struct btrfs_root *root = block_group->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; @@ -699,7 +711,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; @@ -851,7 +863,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; @@ -1046,7 +1058,7 @@ out: static int populate_free_space_tree(struct 
btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *extent_root = trans->fs_info->extent_root; + struct btrfs_root *extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; @@ -1080,6 +1092,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; + extent_root = btrfs_extent_root(trans->fs_info, key.objectid); ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; @@ -1157,7 +1170,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) ret = PTR_ERR(free_space_root); goto abort; } - fs_info->free_space_root = free_space_root; + ret = btrfs_global_root_insert(free_space_root); + if (ret) { + btrfs_put_root(free_space_root); + goto abort; + } node = rb_first(&fs_info->block_group_cache_tree); while (node) { @@ -1232,7 +1249,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *free_space_root = fs_info->free_space_root; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); int ret; trans = btrfs_start_transaction(tree_root, 0); @@ -1241,7 +1263,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); - fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); if (ret) @@ -1251,13 +1272,14 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) if (ret) goto abort; + btrfs_global_root_delete(free_space_root); list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); btrfs_clean_tree_block(free_space_root->node); btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, free_space_root, free_space_root->node, - 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 1); btrfs_put_root(free_space_root); @@ -1319,7 +1341,7 @@ out: int remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { - struct btrfs_root *root = trans->fs_info->free_space_root; + struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -1410,7 +1432,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; @@ -1488,7 +1510,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - root = fs_info->free_space_root; + root = btrfs_free_space_root(block_group); end = block_group->start + block_group->length; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 37f36ffdaf6b..0eeb5ea87894 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -4,6 +4,7 @@ */ #include "ctree.h" +#include "inode-item.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" @@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, u32 
cur_offset = 0;
 	int len;
 
-	item_size = btrfs_item_size_nr(leaf, slot);
+	item_size = btrfs_item_size(leaf, slot);
 	ptr = btrfs_item_ptr_offset(leaf, slot);
 	while (cur_offset < item_size) {
 		ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
@@ -45,7 +46,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
 	u32 cur_offset = 0;
 	int ref_name_len;
 
-	item_size = btrfs_item_size_nr(leaf, slot);
+	item_size = btrfs_item_size(leaf, slot);
 	ptr = btrfs_item_ptr_offset(leaf, slot);
 
 	/*
@@ -139,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (index)
 		*index = btrfs_inode_extref_index(leaf, extref);
@@ -208,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (index)
 		*index = btrfs_inode_ref_index(leaf, ref);
@@ -256,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_EXTREF_KEY;
@@ -282,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 		goto out;
 
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(path->slots[0]);
 	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
-	ptr += btrfs_item_size(leaf, item) - ins_len;
+	ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
 	extref = (struct btrfs_inode_extref *)ptr;
 
 	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
@@ -332,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	if (ref)
 		goto out;
 
-	old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
 	btrfs_extend_item(path, ins_len);
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			     struct btrfs_inode_ref);
@@ -419,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	return ret;
 }
+
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+					struct extent_buffer *leaf,
+					struct btrfs_file_extent_item *fi,
+					u64 offset, int extent_type, int slot)
+{
+	if (!inode)
+		return;
+	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+		trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+						    offset);
+	else
+		trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans:	A transaction handle.
+ * @root:	The root from which to remove items.
+ * @inode:	The inode whose items we want to remove.
+ * @control:	The btrfs_truncate_control to control how and what we
+ *		are truncating.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equal to @min_type.  When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equal to @new_size.  If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when
+ * @min_type is BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
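+ *
+ * A minimal sketch of a caller, modeled on the free space cache truncation
+ * path above (identifiers are illustrative, not a new API):
+ *
+ *	struct btrfs_truncate_control control = {
+ *		.inode = inode,
+ *		.new_size = 0,
+ *		.ino = btrfs_ino(inode),
+ *		.min_type = BTRFS_EXTENT_DATA_KEY,
+ *		.clear_extent_range = true,
+ *	};
+ *
+ *	ret = btrfs_truncate_inode_items(trans, root, &control);
+ *	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ *	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);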
+ */ +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + u64 new_size = control->new_size; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; + u32 found_type = (u8)-1; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int ret; + u64 bytes_deleted = 0; + bool be_nice = false; + + ASSERT(control->inode || !control->clear_extent_range); + ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY); + + control->last_size = new_size; + control->sub_bytes = 0; + + /* + * For shareable roots we want to back off from time to time, this turns + * out to be subvolume roots, reloc roots, and data reloc roots. + */ + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + be_nice = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_BACK; + + key.objectid = control->ino; + key.offset = (u64)-1; + key.type = (u8)-1; + +search_again: + /* + * With a 16K leaf size and 128MiB extents, you can actually queue up a + * huge file in a single leaf. Most of the time that bytes_deleted is + * > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; + } + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = 0; + /* There are no items in the tree for us to truncate, we're done */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; + bool should_throttle = false; + + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = found_key.type; + + if (found_key.objectid != control->ino) + break; + + if (found_type < control->min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if (extent_type != BTRFS_FILE_EXTENT_INLINE) + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + item_end += btrfs_file_extent_ram_bytes(leaf, fi); + + btrfs_trace_truncate(control->inode, leaf, fi, + found_key.offset, extent_type, + path->slots[0]); + item_end--; + } + if (found_type > control->min_type) { + del_item = 1; + } else { + if (item_end < new_size) + break; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + } + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + control->extents_found++; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + + clear_start = found_key.offset; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - extent_num_bytes); + if (extent_start != 0) + control->sub_bytes += num_dec; 
+				btrfs_mark_buffer_dirty(leaf);
+			} else {
+				extent_num_bytes =
+					btrfs_file_extent_disk_num_bytes(leaf, fi);
+				extent_offset = found_key.offset -
+					btrfs_file_extent_offset(leaf, fi);
+
+				/* FIXME blocksize != 4096 */
+				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+				if (extent_start != 0)
+					control->sub_bytes += num_dec;
+			}
+			clear_len = num_dec;
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			/*
+			 * We can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+			    btrfs_file_extent_compression(leaf, fi) == 0) {
+				u32 size = (u32)(new_size - found_key.offset);
+
+				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+				size = btrfs_file_extent_calc_inline_size(size);
+				btrfs_truncate_item(path, size, 1);
+			} else if (!del_item) {
+				/*
+				 * We have to bail so the last_size is set to
+				 * just before this extent.
+				 */
+				ret = BTRFS_NEED_TRUNCATE_BLOCK;
+				break;
+			} else {
+				/*
+				 * Inline extents are special, we just treat
+				 * them as a full sector worth in the file
+				 * extent tree just for simplicity's sake.
+				 */
+				clear_len = fs_info->sectorsize;
+			}
+
+			control->sub_bytes += item_end + 1 - new_size;
+		}
+delete:
+		/*
+		 * We only want to clear the file extent range if we're
+		 * modifying the actual inode's mapping, which is just the
+		 * normal truncate path.
+		 */
+		if (control->clear_extent_range) {
+			ret = btrfs_inode_clear_file_extent_range(control->inode,
+						  clear_start, clear_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+		}
+
+		if (del_item) {
+			ASSERT(!pending_del_nr ||
+			       ((path->slots[0] + 1) == pending_del_slot));
+
+			control->last_size = found_key.offset;
+			if (!pending_del_nr) {
+				/* No pending yet, add ourselves */
+				pending_del_slot = path->slots[0];
+				pending_del_nr = 1;
+			} else if (pending_del_nr &&
+				   path->slots[0] + 1 == pending_del_slot) {
+				/* Hop on the pending chunk */
+				pending_del_nr++;
+				pending_del_slot = path->slots[0];
+			}
+		} else {
+			control->last_size = new_size;
+			break;
+		}
+
+		if (del_item && extent_start != 0 && !control->skip_ref_updates) {
+			struct btrfs_ref ref = { 0 };
+
+			bytes_deleted += extent_num_bytes;
+
+			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+					       extent_start, extent_num_bytes, 0);
+			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+					    control->ino, extent_offset,
+					    root->root_key.objectid, false);
+			ret = btrfs_free_extent(trans, &ref);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+			if (be_nice) {
+				if (btrfs_should_throttle_delayed_refs(trans))
+					should_throttle = true;
+			}
+		}
+
+		if (found_type == BTRFS_INODE_ITEM_KEY)
+			break;
+
+		if (path->slots[0] == 0 ||
+		    path->slots[0] != pending_del_slot ||
+		    should_throttle) {
+			if (pending_del_nr) {
+				ret = btrfs_del_items(trans, root, path,
+						      pending_del_slot,
+						      pending_del_nr);
+				if (ret) {
+					btrfs_abort_transaction(trans, ret);
+					break;
+				}
+				pending_del_nr = 0;
+			}
+			btrfs_release_path(path);
+
+			/*
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once in a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating.  Since we hold a transaction here we
+			 * can't flush, and we don't want to FLUSH_LIMIT because
+			 * we could have generated too many delayed refs to
+			 * actually allocate, so just bail if we're short and
+			 * let the normal reservation dance happen higher up.
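+			 *
+			 * (Callers on shareable roots that can cope with a
+			 * restart are expected to treat the resulting -EAGAIN
+			 * as "end the transaction, start a fresh one, then
+			 * call back in".)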
+ */ + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } + } + goto search_again; + } else { + path->slots[0]--; + } + } +out: + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } + } + + ASSERT(control->last_size >= new_size); + if (!ret && control->last_size > new_size) + control->last_size = new_size; + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h new file mode 100644 index 000000000000..a8fc16d0147f --- /dev/null +++ b/fs/btrfs/inode-item.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_INODE_ITEM_H +#define BTRFS_INODE_ITEM_H + +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_path; +struct btrfs_key; +struct btrfs_inode_extref; +struct btrfs_inode; +struct extent_buffer; + +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define BTRFS_NEED_TRUNCATE_BLOCK 1 + +struct btrfs_truncate_control { + /* + * IN: the inode we're operating on, this can be NULL if + * ->clear_extent_range is false. + */ + struct btrfs_inode *inode; + + /* IN: the size we're truncating to. */ + u64 new_size; + + /* OUT: the number of extents truncated. */ + u64 extents_found; + + /* OUT: the last size we truncated this inode to. */ + u64 last_size; + + /* OUT: the number of bytes to sub from this inode. */ + u64 sub_bytes; + + /* IN: the ino we are truncating. */ + u64 ino; + + /* + * IN: minimum key type to remove. All key types with this type are + * removed only if their offset >= new_size. + */ + u32 min_type; + + /* + * IN: true if we don't want to do extent reference updates for any file + * extents we drop. + */ + bool skip_ref_updates; + + /* + * IN: true if we need to clear the file extent range for the inode as + * we drop the file extent items. 
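+	 * The free space cache truncation path, for example, sets this and
+	 * drops the inode's extent cache under the io_tree lock, so the
+	 * cached extent state stays consistent with the removed items.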
+ */ + bool clear_extent_range; +}; + +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_truncate_control *control); +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +struct btrfs_inode_extref *btrfs_lookup_inode_extref( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 487533c35ddb..3b2403b6127f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,6 +6,7 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> +#include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -53,6 +54,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "inode-item.h" struct btrfs_iget_args { u64 ino; @@ -60,8 +62,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 reserve; - loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; }; @@ -287,8 +287,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = page_address(cpage); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -455,11 +456,10 @@ struct async_chunk { struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; - atomic_t *pending; + struct async_cow *async_cow; }; struct async_cow { - /* Number of chunks in flight; must be first in the structure */ atomic_t num_chunks; struct async_chunk chunks[]; }; @@ -490,9 +490,6 @@ static noinline int add_async_extent(struct async_chunk *cow, */ static inline bool inode_can_compress(struct btrfs_inode *inode) { - /* Subpage doesn't support compression yet */ - if (inode->root->fs_info->sectorsize < PAGE_SIZE) - return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@ -514,6 +511,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, btrfs_ino(inode)); return 0; } + /* + * Special check for subpage. 
+	 *
+	 * We lock the full page then run each delalloc range in the page, thus
+	 * for the following case, we will hit some subpage specific corner case:
+	 *
+	 * 0		32K		64K
+	 * |	|///////|	|///////|
+	 *		\- A		\- B
+	 *
+	 * In the above case, both range A and range B will try to unlock the
+	 * full page [0, 64K), so the range that finishes later will find the
+	 * page already unlocked, triggering various page lock requirement
+	 * BUG_ON()s.
+	 *
+	 * So here we add an artificial limit that subpage compression can only
+	 * happen if the range is fully page aligned.
+	 *
+	 * In theory we only need to ensure the first page is fully covered, but
+	 * the trailing partial page will be locked until the full compression
+	 * finishes, delaying the write of the other ranges.
+	 *
+	 * TODO: Make btrfs_run_delalloc_range() lock the full delalloc range
+	 * first, to prevent any submitted async extent from unlocking the full
+	 * page.  That way we can ensure for the subpage case that only the last
+	 * async_cow will unlock the full page.
+	 */
+	if (fs_info->sectorsize < PAGE_SIZE) {
+		if (!IS_ALIGNED(start, PAGE_SIZE) ||
+		    !IS_ALIGNED(end + 1, PAGE_SIZE))
+			return 0;
+	}
+
 	/* force compress */
 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 		return 1;
@@ -615,13 +644,24 @@ again:
 	total_compressed = actual_end - start;
 
 	/*
-	 * skip compression for a small file range(<=blocksize) that
+	 * Skip compression for a small file range (<= blocksize) that
 	 * isn't an inline extent, since it doesn't save disk space at all.
 	 */
 	if (total_compressed <= blocksize &&
 	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		goto cleanup_and_bail_uncompressed;
 
+	/*
+	 * For the subpage case, we require full page alignment of the sector
+	 * aligned range.
+	 * Thus we must also check against @actual_end, not just @end.
+	 */
+	if (blocksize < PAGE_SIZE) {
+		if (!IS_ALIGNED(start, PAGE_SIZE) ||
+		    !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+			goto cleanup_and_bail_uncompressed;
+	}
+
 	total_compressed = min_t(unsigned long, total_compressed,
 			BTRFS_MAX_UNCOMPRESSED);
 	total_in = 0;
@@ -759,7 +799,7 @@ cont:
 	 * win, compare the page count read with the blocks on disk,
 	 * compression must free at least one sector size
 	 */
-	total_in = ALIGN(total_in, PAGE_SIZE);
+	total_in = round_up(total_in, fs_info->sectorsize);
 	if (total_compressed + blocksize <= total_in) {
 		compressed_extents++;
@@ -840,166 +880,148 @@ static void free_async_extent_pages(struct async_extent *async_extent)
 	async_extent->pages = NULL;
 }
 
-/*
- * phase two of compressed writeback.  This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued.  We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +static int submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - struct btrfs_key ins; - struct extent_map *em; - struct btrfs_root *root = inode->root; - struct extent_io_tree *io_tree = &inode->io_tree; - int ret = 0; - -again: - while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - -retry: - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - /* did the compression code fall back to uncompressed IO? */ - if (!async_extent->pages) { - int page_started = 0; - unsigned long nr_written = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + unsigned long nr_written = 0; + int page_started = 0; + int ret; - /* allocate blocks */ - ret = cow_file_range(inode, async_chunk->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + /* + * Call cow_file_range() to run the delalloc range directly, since we + * won't go to NOCOW or async path again. + * + * Also we call cow_file_range() with @unlock_page == 0, so that we + * can directly submit them without interruption. + */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0); + /* Inline extent inserted, page gets unlocked and everything is done */ + if (page_started) { + ret = 0; + goto out; + } + if (ret < 0) { + if (locked_page) + unlock_page(locked_page); + goto out; + } - /* JDM XXX */ + ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ +out: + kfree(async_extent); + return ret; +} - /* - * if page_started, cow_file_range inserted an - * inline extent and took care of all the unlocking - * and IO for us. Otherwise, we need to submit - * all those pages down to the drive. - */ - if (!page_started && !ret) - extent_write_locked_range(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - WB_SYNC_ALL); - else if (ret && async_chunk->locked_page) - unlock_page(async_chunk->locked_page); - kfree(async_extent); - cond_resched(); - continue; - } +static int submit_one_async_extent(struct btrfs_inode *inode, + struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) +{ + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key ins; + struct page *locked_page = NULL; + struct extent_map *em; + int ret = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; - ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, - 0, alloc_hint, &ins, 1, 1); - if (ret) { - free_async_extent_pages(async_extent); + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. 
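+	 * The test below is the usual interval-overlap check: the two ranges
+	 * only miss each other when one ends before the other begins.  E.g.
+	 * (hypothetical offsets) a locked page at [64K, 64K + PAGE_SIZE - 1]
+	 * overlaps an async extent covering [0, 128K - 1], so it must be
+	 * passed down to be unlocked.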
+	 */
+	if (async_chunk->locked_page) {
+		u64 locked_page_start = page_offset(async_chunk->locked_page);
+		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
-		if (ret == -ENOSPC) {
-			unlock_extent(io_tree, async_extent->start,
-				      async_extent->start +
-				      async_extent->ram_size - 1);
+		if (!(start >= locked_page_end || end <= locked_page_start))
+			locked_page = async_chunk->locked_page;
+	}
+	lock_extent(io_tree, start, end);
-			/*
-			 * we need to redirty the pages if we decide to
-			 * fallback to uncompressed IO, otherwise we
-			 * will not submit these pages down to lower
-			 * layers.
-			 */
-			extent_range_redirty_for_io(&inode->vfs_inode,
-					async_extent->start,
-					async_extent->start +
-					async_extent->ram_size - 1);
+	/* We have fallen back to uncompressed write */
+	if (!async_extent->pages)
+		return submit_uncompressed_range(inode, async_extent, locked_page);
-			goto retry;
-		}
-		goto out_free;
-	}
+	ret = btrfs_reserve_extent(root, async_extent->ram_size,
+				   async_extent->compressed_size,
+				   async_extent->compressed_size,
+				   0, *alloc_hint, &ins, 1, 1);
+	if (ret) {
+		free_async_extent_pages(async_extent);
 		/*
-		 * here we're doing allocation and writeback of the
-		 * compressed pages
+		 * Here we used to try again by going back to the non-compressed
+		 * path for ENOSPC.  But if we can't reserve space even for the
+		 * compressed size, it can't possibly work for the uncompressed
+		 * size, which is larger.  So go directly to the error path.
 		 */
-		em = create_io_em(inode, async_extent->start,
-				  async_extent->ram_size, /* len */
-				  async_extent->start, /* orig_start */
-				  ins.objectid, /* block_start */
-				  ins.offset, /* block_len */
-				  ins.offset, /* orig_block_len */
-				  async_extent->ram_size, /* ram_bytes */
-				  async_extent->compress_type,
-				  BTRFS_ORDERED_COMPRESSED);
-		if (IS_ERR(em))
-			/* ret value is not necessary due to void function */
-			goto out_free_reserve;
-		free_extent_map(em);
-
-		ret = btrfs_add_ordered_extent_compress(inode,
-				async_extent->start,
-				ins.objectid,
-				async_extent->ram_size,
-				ins.offset,
-				async_extent->compress_type);
-		if (ret) {
-			btrfs_drop_extent_cache(inode, async_extent->start,
-						async_extent->start +
-						async_extent->ram_size - 1, 0);
-			goto out_free_reserve;
-		}
-		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+		goto out_free;
+	}
+
+	/* Here we're doing allocation and writeback of the compressed pages */
+	em = create_io_em(inode, start,
+			  async_extent->ram_size, /* len */
+			  start, /* orig_start */
+			  ins.objectid, /* block_start */
+			  ins.offset, /* block_len */
+			  ins.offset, /* orig_block_len */
+			  async_extent->ram_size, /* ram_bytes */
+			  async_extent->compress_type,
+			  BTRFS_ORDERED_COMPRESSED);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_free_reserve;
+	}
+	free_extent_map(em);
-		/*
-		 * clear dirty, set writeback and unlock the pages.
- */ - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, async_extent->start, - async_extent->ram_size, - ins.objectid, - ins.offset, async_extent->pages, - async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css)) { - struct page *p = async_extent->pages[0]; - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(inode, p, start, - end, false); - - p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | - PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } - alloc_hint = ins.objectid + ins.offset; - kfree(async_extent); - cond_resched(); + ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ + ins.objectid, /* disk_bytenr */ + async_extent->ram_size, /* num_bytes */ + ins.offset, /* disk_num_bytes */ + async_extent->compress_type); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserve; } - return; + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* Clear dirty, set writeback and unlock the pages. */ + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); + if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, + async_chunk->write_flags, + async_chunk->blkcg_css)) { + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); + + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } + *alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + return ret; + out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -1007,7 +1029,39 @@ out_free: PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); - goto again; + return ret; +} + +/* + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. 
+ */ +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +{ + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct async_extent *async_extent; + u64 alloc_hint = 0; + int ret = 0; + + while (!list_empty(&async_chunk->extents)) { + u64 extent_start; + u64 ram_size; + + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + extent_start = async_extent->start; + ram_size = async_extent->ram_size; + + ret = submit_one_async_extent(inode, async_chunk, async_extent, + &alloc_hint); + /* Only log the failure message when the submission really failed */ + if (ret < 0) + btrfs_debug(fs_info, +"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + inode->root->root_key.objectid, + btrfs_ino(inode), extent_start, ram_size, ret); + } } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1150,7 +1204,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * fails during the stage where it updates the bytenr of file extent * items. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; @@ -1186,8 +1240,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_drop_extent_cache; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* @@ -1325,18 +1378,17 @@ static noinline void async_cow_submit(struct btrfs_work *work) static noinline void async_cow_free(struct btrfs_work *work) { struct async_chunk *async_chunk; + struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); - /* - * Since the pointer to 'pending' is at the beginning of the array of - * async_chunk's, freeing it ensures the whole array has been freed.
- */ - if (atomic_dec_and_test(async_chunk->pending)) - kvfree(async_chunk->pending); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); } static int cow_file_range_async(struct btrfs_inode *inode, @@ -1397,7 +1449,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); - async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].async_cow = ctx; async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; @@ -1470,7 +1522,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1479,11 +1531,12 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { - int ret; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; + int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, + ret = btrfs_lookup_csums_range(csum_root, bytenr, bytenr + num_bytes - 1, &list, 0); if (ret == 0 && list_empty(&list)) return 0; @@ -1503,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); - const bool is_reloc_ino = (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; @@ -1866,8 +1918,7 @@ out_check: btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler @@ -1946,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + /* + * The range must cover part of the @locked_page, or the returned + * @page_started can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + if (should_nocow(inode, start, end)) { - ASSERT(!zoned); + /* + * Normally on a zoned device we're only doing COW writes, but + * in case of relocation on a zoned filesystem we have taken the + * precaution of only writing sequentially. It's safe + * to use run_delalloc_nocow() here, like for regular + * preallocated inodes.
+ */ + ASSERT(!zoned || + (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@ -2206,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2234,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } /* - * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit - * in a chunk's stripe. This function ensures that bios do not span a - * stripe/chunk - * - * @page - The page we are about to add to the bio - * @size - size we want to add to the bio - * @bio - bio we want to ensure is smaller than a stripe - * @bio_flags - flags of the bio - * - * return 1 if page cannot be added to the bio - * return 0 if page can be added to the bio - * return error otherwise - */ -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = bio->bi_iter.bi_sector << 9; - u32 bio_len = bio->bi_iter.bi_size; - struct extent_map *em; - int ret = 0; - struct btrfs_io_geometry geom; - - if (bio_flags & EXTENT_BIO_COMPRESSED) - return 0; - - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); - if (ret < 0) - goto out; - - if (geom.len < bio_len + size) - ret = 1; -out: - free_extent_map(em); - return ret; -} - -/* * in order to insert checksums into the metadata in large chunks, * we wait until bio submission time. All the pages in the bio are * checksummed and sums are attached onto the ordered extent record. 
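The assertion added to btrfs_run_delalloc_range() above, like the locked_page check in submit_one_async_extent() earlier in this diff, uses the standard closed-interval overlap idiom: two ranges intersect unless one ends before the other starts. A minimal standalone sketch of that predicate (the helper name is hypothetical and not part of the kernel; uint64_t stands in for the kernel's u64):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the overlap test: the delalloc range [start, end] (end
 * inclusive) intersects the page starting at page_start iff it neither
 * ends at/before the page start nor begins at/after the page end. The
 * non-strict comparisons mirror the kernel code and are safe only
 * because btrfs ranges are sector aligned, so an inclusive end can
 * never land exactly on a page boundary.
 */
static bool range_covers_page(uint64_t start, uint64_t end,
                              uint64_t page_start, uint64_t page_size)
{
        return !(end <= page_start || start >= page_start + page_size);
}

With 4 KiB pages, range_covers_page(0, 4095, 0, 4096) is true while range_covers_page(4096, 8191, 0, 4096) is false, matching what the ASSERT() above accepts and rejects.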
@@ -2494,7 +2518,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - !fs_info->csum_root; + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2531,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, @@ -2562,11 +2586,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; + struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); + if (!csum_root) + csum_root = btrfs_csum_root(trans->fs_info, + sum->bytenr); + ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2764,7 +2792,7 @@ out_page: clear_page_dirty_for_io(page); SetPageError(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2819,7 +2847,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock. */ ihold(inode); - SetPageChecked(page); + btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -3010,8 +3038,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (ordered_extent->bdev) + /* A valid bdev implies a write on a sequential zone */ + if (ordered_extent->bdev) { btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } btrfs_free_io_failure_record(inode, start, end); @@ -3208,7 +3240,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, * * The length of such check is always one sector size. 
*/ -static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, +static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { @@ -3224,7 +3256,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, ASSERT(pgoff + len <= PAGE_SIZE); offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; + csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@ -3238,9 +3270,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); - if (io_bio->device) - btrfs_dev_stat_inc_and_print(io_bio->device, + bbio->mirror_num); + if (bbio->device) + btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3260,39 +3292,35 @@ zeroit: * Return a bitmap where bit set means a csum mismatch, and bit not set means * csum match. */ -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; - if (PageChecked(page)) { - ClearPageChecked(page); + if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { + btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); return 0; } /* - * For subpage case, above PageChecked is not safe as it's not subpage - * compatible. - * But for now only cow fixup and compressed read utilize PageChecked - * flag, while in this context we can easily use io_bio->csum to - * determine if we really need to do csum verification. - * - * So for now, just exit if io_bio->csum is NULL, as it means it's - * compressed read, and its compressed data csum has already been - * verified. + * This only happens for NODATASUM or compressed read. + * Normally this should be covered by above check for compressed read + * or the next check for NODATASUM. Just do a quicker exit here. 
*/ - if (io_bio->csum == NULL) + if (bbio->csum == NULL) return 0; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (!root->fs_info->csum_root) + if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) return 0; ASSERT(page_offset(page) <= start && @@ -3303,7 +3331,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, u64 file_offset = pg_off + page_offset(page); int ret; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { @@ -3313,7 +3341,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> @@ -3453,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) + if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); @@ -3611,8 +3639,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* release the path since we're done with it */ btrfs_release_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) @@ -4004,7 +4030,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@ -4034,11 +4060,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { + struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; @@ -4098,19 +4124,9 @@ skip_backref: goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); - if (ret == -ENOENT) - ret = 0; - else if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); /* * If we have a pending delayed iput we could end up with the final iput @@ -4138,15 +4154,14 @@ out: } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } @@ -4175,7 
+4190,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; @@ -4187,7 +4201,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) @@ -4201,7 +4215,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } @@ -4368,7 +4382,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); @@ -4552,7 +4566,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; u64 last_unlink_trans; @@ -4577,7 +4590,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { @@ -4598,395 +4611,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } /* - * Return this if we need to call truncate_block for the last bit of the - * truncate. - */ -#define NEED_TRUNCATE_BLOCK 1 - -/* - * Remove inode items from a given root. - * - * @trans: A transaction handle. - * @root: The root from which to remove items. - * @inode: The inode whose items we want to remove. - * @new_size: The new i_size for the inode. This is only applicable when - * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. - * @min_type: The minimum key type to remove. All keys with a type - * greater than this value are removed and all keys with - * this type are removed only if their offset is >= @new_size. - * @extents_found: Output parameter that will contain the number of file - * extent items that were removed or adjusted to the new - * inode i_size. The caller is responsible for initializing - * the counter. Also, it can be NULL if the caller does not - * need this counter. - * - * Remove all keys associated with the inode from the given root that have a key - * with a type greater than or equals to @min_type. When @min_type has a value of - * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value - * greater than or equals to @new_size. If a file extent item that starts before - * @new_size and ends after it is found, its length is adjusted. - * - * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is - * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. 
- */ -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - u64 new_size, u32 min_type, - u64 *extents_found) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - struct btrfs_key found_key; - u64 extent_start = 0; - u64 extent_num_bytes = 0; - u64 extent_offset = 0; - u64 item_end = 0; - u64 last_size = new_size; - u32 found_type = (u8)-1; - int found_extent; - int del_item; - int pending_del_nr = 0; - int pending_del_slot = 0; - int extent_type = -1; - int ret; - u64 ino = btrfs_ino(inode); - u64 bytes_deleted = 0; - bool be_nice = false; - bool should_throttle = false; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); - struct extent_state *cached_state = NULL; - - BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - - /* - * For non-free space inodes and non-shareable roots, we want to back - * off from time to time. This means all inodes in subvolume roots, - * reloc roots, and data reloc roots. - */ - if (!btrfs_is_free_space_inode(inode) && - test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - be_nice = true; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_BACK; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - - /* - * We want to drop from the next block forward in case this - * new size is not block aligned since we will be keeping the - * last block of the extent just the way it is. - */ - btrfs_drop_extent_cache(inode, ALIGN(new_size, - fs_info->sectorsize), - (u64)-1, 0); - } - - /* - * This function is also used to drop the items in the log tree before - * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the logged items. So we shouldn't kill the delayed - * items. - */ - if (min_type == 0 && root == inode->root) - btrfs_kill_delayed_inode_items(inode); - - key.objectid = ino; - key.offset = (u64)-1; - key.type = (u8)-1; - -search_again: - /* - * with a 16K leaf size and 128MB extents, you can actually queue - * up a huge file in a single leaf. 
Most of the time that - * bytes_deleted is > 0, it will be huge by the time we get here - */ - if (be_nice && bytes_deleted > SZ_32M && - btrfs_should_end_transaction(trans)) { - ret = -EAGAIN; - goto out; - } - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = 0; - /* there are no items in the tree for us to truncate, we're - * done - */ - if (path->slots[0] == 0) - goto out; - path->slots[0]--; - } - - while (1) { - u64 clear_start = 0, clear_len = 0; - - fi = NULL; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; - - if (found_key.objectid != ino) - break; - - if (found_type < min_type) - break; - - item_end = found_key.offset; - if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - item_end += - btrfs_file_extent_num_bytes(leaf, fi); - - trace_btrfs_truncate_show_fi_regular( - inode, leaf, fi, found_key.offset); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_ram_bytes(leaf, - fi); - - trace_btrfs_truncate_show_fi_inline( - inode, leaf, fi, path->slots[0], - found_key.offset); - } - item_end--; - } - if (found_type > min_type) { - del_item = 1; - } else { - if (item_end < new_size) - break; - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; - } - found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ - if (found_type != BTRFS_EXTENT_DATA_KEY) - goto delete; - - if (extents_found != NULL) - (*extents_found)++; - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - u64 num_dec; - - clear_start = found_key.offset; - extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { - u64 orig_num_bytes = - btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = ALIGN(new_size - - found_key.offset, - fs_info->sectorsize); - clear_start = ALIGN(new_size, fs_info->sectorsize); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_num_bytes); - num_dec = (orig_num_bytes - - extent_num_bytes); - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state) && - extent_start != 0) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - btrfs_mark_buffer_dirty(leaf); - } else { - extent_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, - fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - - /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_bytes(leaf, fi); - if (extent_start != 0) { - found_extent = 1; - if (test_bit(BTRFS_ROOT_SHAREABLE, - &root->state)) - inode_sub_bytes(&inode->vfs_inode, - num_dec); - } - } - clear_len = num_dec; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - /* - * we can't truncate inline items that have had - * special encodings - */ - if (!del_item && - btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0 && - btrfs_file_extent_compression(leaf, fi) == 0) { - u32 size = (u32)(new_size - found_key.offset); - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); - } else if (!del_item) { - /* - * We have to bail so the last_size is set to - * just before this extent. 
- */ - ret = NEED_TRUNCATE_BLOCK; - break; - } else { - /* - * Inline extents are special, we just treat - * them as a full sector worth in the file - * extent tree just for simplicity sake. - */ - clear_len = fs_info->sectorsize; - } - - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(&inode->vfs_inode, - item_end + 1 - new_size); - } -delete: - /* - * We use btrfs_truncate_inode_items() to clean up log trees for - * multiple fsyncs, and in this case we don't want to clear the - * file extent range because it's just the log. - */ - if (root == inode->root) { - ret = btrfs_inode_clear_file_extent_range(inode, - clear_start, clear_len); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - } - - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (del_item) { - if (!pending_del_nr) { - /* no pending yet, add ourselves */ - pending_del_slot = path->slots[0]; - pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { - /* hop on the pending chunk */ - pending_del_nr++; - pending_del_slot = path->slots[0]; - } else { - BUG(); - } - } else { - break; - } - should_throttle = false; - - if (found_extent && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_ref ref = { 0 }; - - bytes_deleted += extent_num_bytes; - - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); - ref.real_root = root->root_key.objectid; - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset); - ret = btrfs_free_extent(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } - } - - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - - if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot || - should_throttle) { - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (ret) { - btrfs_abort_transaction(trans, ret); - break; - } - pending_del_nr = 0; - } - btrfs_release_path(path); - - /* - * We can generate a lot of delayed refs, so we need to - * throttle every once and a while and make sure we're - * adding enough space to keep up with the work we are - * generating. Since we hold a transaction here we - * can't flush, and we don't want to FLUSH_LIMIT because - * we could have generated too many delayed refs to - * actually allocate, so just bail if we're short and - * let the normal reservation dance happen higher up. 
- */ - if (should_throttle) { - ret = btrfs_delayed_refs_rsv_refill(fs_info, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - ret = -EAGAIN; - break; - } - } - goto search_again; - } else { - path->slots[0]--; - } - } -out: - if (ret >= 0 && pending_del_nr) { - int err; - - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; - } - } - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ASSERT(last_size >= new_size); - if (!ret && last_size > new_size) - last_size = new_size; - btrfs_inode_safe_disk_i_size_write(inode, last_size); - unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, - &cached_state); - } - - btrfs_free_path(path); - return ret; -} - -/* * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing @@ -5105,7 +4735,8 @@ again: len); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, block_start, + block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -5513,7 +5144,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); int ret; @@ -5528,18 +5158,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * - * If we cannot make our reservation we'll attempt to steal from the - * global reserve, because we really want to be able to free up space. + * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, + * if we fail to make this reservation we can re-try without the + * delayed_refs_extra so we can make some forward progress. */ - ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { - /* - * Try to steal from the global reserve if there is space for - * it. - */ - if (btrfs_check_space_for_delayed_refs(fs_info) || - btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); @@ -5598,10 +5226,22 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * This makes sure the inode item in tree is uptodate and the space for + * the inode update is released. + */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto no_delete; + /* + * This drops any pending insert or delete operations we have for this + * inode. We could have a delayed dir index deletion queued up, but + * we're removing the inode completely so that'll be taken care of in + * the truncate. 
+ */ + btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; @@ -5611,14 +5251,20 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { + struct btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .new_size = 0, + .min_type = 0, + }; + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0, NULL); + ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6435,7 +6081,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; - int nitems = name ? 2 : 1; + struct btrfs_item_batch batch; unsigned long ptr; unsigned int nofs_flag; int ret; @@ -6527,7 +6173,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); + batch.nr = name ? 2 : 1; + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) goto fail_unlock; @@ -6982,8 +6632,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); - inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(path->slots[0])); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; @@ -7757,6 +7406,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + bool can_nocow = false; + bool space_reserved = false; int ret = 0; /* @@ -7771,9 +7424,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else @@ -7783,53 +7433,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; + btrfs_inc_nocow_writers(fs_info, block_start)) + can_nocow = true; + } + + if (can_nocow) { + struct extent_map *em2; - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); + /* We can NOCOW, so only need to reserve metadata space. */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) { + /* Our caller expects us to free the input extent map. 
*/ + free_extent_map(em); + *map = NULL; btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em = em2; - } + goto out; + } + space_reserved = true; - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. - */ - btrfs_free_reserved_data_space_noquota(fs_info, len); - goto skip_cow; + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; } - } - /* this will cow the extent */ - free_extent_map(em); - *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + } else { + const u64 prev_len = len; + + /* Our caller expects us to free the input extent map. */ + free_extent_map(em); + *map = NULL; + + /* We have to COW, so need to reserve metadata and data space. */ + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, len); + if (ret < 0) + goto out; + space_reserved = true; + + em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + *map = em; + len = min(len, em->len - (start - em->start)); + if (len < prev_len) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start + len, prev_len - len, + true); } - len = min(len, em->len - (start - em->start)); + /* + * We have created our ordered extent, so we can now release our reservation + * for an outstanding extent. + */ + btrfs_delalloc_release_extents(BTRFS_I(inode), len); -skip_cow: /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. 
*/ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - - dio_data->reserve -= len; out: + if (ret && space_reserved) { + btrfs_delalloc_release_extents(BTRFS_I(inode), len); + if (can_nocow) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); + } else { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, + start, len, true); + extent_changeset_free(dio_data->data_reserved); + dio_data->data_reserved = NULL; + } + } return ret; } @@ -7871,18 +7560,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } - } iomap->private = dio_data; @@ -7961,7 +7638,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->type = IOMAP_MAPPED; } iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) @@ -7975,14 +7652,8 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, start, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } + kfree(dio_data); + return ret; } @@ -8012,14 +7683,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ret = -ENOTBLK; } - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + if (write) extent_changeset_free(dio_data->data_reserved); - } out: kfree(dio_data); iomap->private = NULL; @@ -8038,13 +7703,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->logical_offset, + dip->file_offset, dip->bytes, !dip->dio_bio->bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, - dip->logical_offset, - dip->logical_offset + dip->bytes - 1); + dip->file_offset, + dip->file_offset + dip->bytes - 1); } bio_endio(dip->dio_bio); @@ -8072,10 +7737,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, return ret; } -static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, - struct btrfs_io_bio *io_bio, +static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, + struct btrfs_bio *bbio, const bool uptodate) { + struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -8083,11 +7749,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - u64 start = io_bio->logical; + const u64 orig_file_offset = dip->file_offset; + u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = 
BLK_STS_OK; - __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { unsigned int i, nr_sectors, pgoff; nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); @@ -8095,7 +7762,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, + (!csum || !check_data_csum(inode, bbio, bio_offset, bvec.bv_page, pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, @@ -8105,12 +7772,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, } else { int ret; - ASSERT((start - io_bio->logical) < UINT_MAX); + ASSERT((start - orig_file_offset) < UINT_MAX); ret = btrfs_repair_one_sector(inode, - &io_bio->bio, - start - io_bio->logical, + &bbio->bio, + start - orig_file_offset, bvec.bv_page, pgoff, - start, io_bio->mirror_num, + start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); @@ -8151,15 +7818,13 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (bio_op(bio) == REQ_OP_READ) { - err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), - !err); - } + if (bio_op(bio) == REQ_OP_READ) + err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8201,10 +7866,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, } else { u64 csum_offset; - csum_offset = file_offset - dip->logical_offset; + csum_offset = file_offset - dip->file_offset; csum_offset >>= fs_info->sectorsize_bits; csum_offset *= fs_info->csum_size; - btrfs_io_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@ -8239,7 +7904,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return NULL; dip->inode = inode; - dip->logical_offset = file_offset; + dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; @@ -8247,7 +7912,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, +static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; @@ -8277,7 +7942,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); - return BLK_QC_T_NONE; + return; } if (!write) { @@ -8320,7 +7985,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@ -8371,15 +8035,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, free_extent_map(em); } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - - return BLK_QC_T_NONE; } const 
struct iomap_ops btrfs_dio_iomap_ops = { @@ -8696,9 +8358,9 @@ next: * did something wrong. */ ASSERT(!PageOrdered(page)); + btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8842,7 +8504,7 @@ again: memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -8871,6 +8533,12 @@ out_noreserve: static int btrfs_truncate(struct inode *inode, bool skip_writeback) { + struct btrfs_truncate_control control = { + .inode = BTRFS_I(inode), + .ino = btrfs_ino(BTRFS_I(inode)), + .min_type = BTRFS_EXTENT_DATA_KEY, + .clear_extent_range = true, + }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; @@ -8878,7 +8546,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); - u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8939,10 +8606,30 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - inode->i_size, - BTRFS_EXTENT_DATA_KEY, - &extents_found); + struct extent_state *cached_state = NULL; + const u64 new_size = inode->i_size; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + + control.new_size = new_size; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* + * We want to drop from the next block forward in case this new + * size is not block aligned since we will be keeping the last + * block of the extent just the way it is. + */ + btrfs_drop_extent_cache(BTRFS_I(inode), + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, 0); + + ret = btrfs_truncate_inode_items(trans, root, &control); + + inode_sub_bytes(inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8970,11 +8657,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* * We can't call btrfs_truncate_block inside a trans handle as we could - * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know - * we've truncated everything except the last little bit, and can do - * btrfs_truncate_block and then update the disk_i_size. + * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we + * know we've truncated everything except the last little bit, and can + * do btrfs_truncate_block and then update the disk_i_size. */ - if (ret == NEED_TRUNCATE_BLOCK) { + if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -9018,7 +8705,7 @@ out: * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. 
*/ - if (extents_found > 0) + if (control.extents_found > 0) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; @@ -9152,8 +8839,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); - WARN_ON(inode->delalloc_bytes); - WARN_ON(inode->new_delalloc_bytes); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); @@ -9450,7 +9139,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9466,7 +9155,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9741,7 +9430,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, */ btrfs_pin_log_trans(root); log_pinned = true; - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9761,7 +9450,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9979,7 +9668,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte }; struct btrfs_fs_info *fs_info = root->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); @@ -9998,7 +9687,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, struct list_head splice; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; INIT_LIST_HEAD(&splice); @@ -10580,9 +10269,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; + unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. 
+ */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, PAGE_SIZE) >> PAGE_SHIFT; @@ -10590,6 +10289,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813213d8..a5bd6926f7ff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -48,6 +48,7 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "subpage.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 { compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ - __u64 reserved[4]; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ @@ -385,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, * * Compatibility: * - the same type is already running + * - when trying to add a device and balance has been paused * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller * must check the condition first that would allow none -> @type */ @@ -392,7 +395,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type) + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) return true; spin_unlock(&fs_info->super_lock); @@ -412,6 +417,29 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} + static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -516,7 +544,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; - int err; dev_t anon_dev = 0; u64 objectid; u64 index = 0; @@ -615,11 +642,13 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the - * extent tree without backreferences). 
Also no need to have - * the tree block locked since it is not in any tree at this - * point, so no other task can find it and use it. + * extent tree with a backreference for a root that does not + * exists). */ - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_tree_lock(leaf); + btrfs_clean_tree_block(leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); goto fail; } @@ -694,9 +723,10 @@ fail: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret) + btrfs_end_transaction(trans); + else + ret = btrfs_commit_transaction(trans); if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); @@ -985,129 +1015,32 @@ out: return ret; } -/* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) -{ - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 end; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); - read_unlock(&em_tree->lock); - - if (em) { - end = extent_map_end(em); - free_extent_map(em); - if (end - offset > thresh) - return 0; - } - /* if we already have a nice delalloc here, just stop */ - thresh /= 2; - end = count_range_bits(io_tree, &offset, offset + thresh, - thresh, EXTENT_DELALLOC, 1); - if (end >= thresh) - return 0; - return 1; -} - -/* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. 
- * - * This is used by the defragging code to find new and small - * extents - */ -static int find_new_extents(struct btrfs_root *root, - struct inode *inode, u64 newer_than, - u64 *off, u32 thresh) -{ - struct btrfs_path *path; - struct btrfs_key min_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - int type; - int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - min_key.objectid = ino; - min_key.type = BTRFS_EXTENT_DATA_KEY; - min_key.offset = *off; - - while (1) { - ret = btrfs_search_forward(root, &min_key, path, newer_than); - if (ret != 0) - goto none; -process_slot: - if (min_key.objectid != ino) - goto none; - if (min_key.type != BTRFS_EXTENT_DATA_KEY) - goto none; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG && - btrfs_file_extent_num_bytes(leaf, extent) < thresh && - check_defrag_in_cache(inode, min_key.offset, thresh)) { - *off = min_key.offset; - btrfs_free_path(path); - return 0; - } - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); - goto process_slot; - } - - if (min_key.offset == (u64)-1) - goto none; - - min_key.offset++; - btrfs_release_path(path); - } -none: - btrfs_free_path(path); - return -ENOENT; -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_SIZE; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; /* * hopefully we have this extent in the tree already, try without * the full extent lock */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); if (!em) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - unlock_extent_cached(io_tree, start, end, &cached); + if (!locked) + lock_extent_bits(io_tree, start, end, &cached); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + if (!locked) + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1116,7 +1049,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) return em; } -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) { struct extent_map *next; bool ret = true; @@ -1125,7 +1059,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len); + next = defrag_lookup_extent(inode, em->start + em->len, locked); if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && @@ -1136,297 +1070,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct 
extent_map *em) return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, - u64 *last_len, u64 *skip, u64 *defrag_end, - int compress) +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared; here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, + pgoff_t index) { - struct extent_map *em; - int ret = 1; - bool next_mergeable = true; - bool prev_mergeable = true; + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); /* - * make sure that once we start defragging an extent, we keep on - * defragging it + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. */ - if (start < *defrag_end) - return 1; + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } - *skip = 0; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } - em = defrag_lookup_extent(inode, start); - if (!em) - return 0; + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; - /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - ret = 0; - goto out; - } + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; - if (!*defrag_end) - prev_mergeable = false; + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need to check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } - next_mergeable = defrag_check_next_extent(inode, em); /* - * we hit a real extent, if it is big or the next extent is not a - * real extent, don't bother defragging it + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. */ - if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || (!next_mergeable && !prev_mergeable))) - ret = 0; -out: - /* - * last_len ends up being a counter of how many bytes we've defragged. - * every time we choose not to defrag an extent, we reset *last_len - * so that the next tiny extent will force a defrag. - * - * The end result of this is that tiny extents before a single big - * extent will force at least part of that big extent to be defragged.
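
The index-to-byte-range arithmetic that defrag_prepare_one_page() above relies on is easy to sanity-check in userspace. A tiny sketch, assuming 4KiB pages (PAGE_SHIFT is hardcoded here only for the demo):

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t index = 3;	/* pgoff_t of the page being prepared */
	uint64_t page_start = index << PAGE_SHIFT;
	uint64_t page_end = page_start + PAGE_SIZE - 1;

	/* The extent lock taken while checking for ordered extents
	 * covers exactly this inclusive byte range. */
	printf("page %" PRIu64 " covers [%" PRIu64 ", %" PRIu64 "]\n",
	       index, page_start, page_end);
	return 0;
}
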
- */ - if (ret) { - *defrag_end = extent_map_end(em); - } else { - *last_len = 0; - *skip = extent_map_end(em); - *defrag_end = 0; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } } - - free_extent_map(em); - return ret; + return page; } +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + /* - * it doesn't do much good to defrag one or two pages - * at a time. This pulls in a nice chunk of pages - * to COW and defrag. + * Collect all valid target extents. * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag - * - * It's a good idea to start RA on this range - * before calling this. + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. + * @locked: if the extent lock is already held for the range + * @target_list: list of target file extents */ -static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - unsigned long num_pages) +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list) { - unsigned long file_end; - u64 isize = i_size_read(inode); - u64 page_start; - u64 page_end; - u64 page_cnt; - u64 start = (u64)start_index << PAGE_SHIFT; - u64 search_start; - int ret; - int i; - int i_done; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_io_tree *tree; - struct extent_changeset *data_reserved = NULL; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u64 cur = start; + int ret = 0; - file_end = (isize - 1) >> PAGE_SHIFT; - if (!isize || start_index > file_end) - return 0; + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start, page_cnt << PAGE_SHIFT); - if (ret) - return ret; - i_done = 0; - tree = &BTRFS_I(inode)->io_tree; + /* Skip hole/inline/preallocated extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; - /* step one, lock all the pages */ - for (i = 0; i < page_cnt; i++) { - struct page *page; -again: - page = find_or_create_page(inode->i_mapping, - start_index + i, mask); - if (!page) - break; + /* Skip older extent */ + if (em->generation < newer_than) + goto next; - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - break; + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check.
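
The collection loop that starts above and continues below reduces to: walk the extent maps, skip holes/preallocated/old/large extents, and merge adjacent survivors into contiguous targets. A compact userspace model of that merging policy, using synthetic extents and invented structures rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t start, len, gen; int hole; };
struct target { uint64_t start, len; };

/* Collect extents newer than @newer_than and smaller than @thresh,
 * merging ranges that touch - mirroring the policy described above. */
static int collect(const struct ext *e, int n, uint64_t thresh,
		   uint64_t newer_than, struct target *out)
{
	int nr = 0;

	for (int i = 0; i < n; i++) {
		if (e[i].hole || e[i].gen < newer_than || e[i].len >= thresh)
			continue;
		if (nr && out[nr - 1].start + out[nr - 1].len == e[i].start) {
			out[nr - 1].len += e[i].len;	/* mergeable */
			continue;
		}
		out[nr].start = e[i].start;
		out[nr].len = e[i].len;
		nr++;
	}
	return nr;
}

int main(void)
{
	const struct ext map[] = {
		{ 0,     4096,  5, 0 }, { 4096, 4096, 5, 0 },	/* merge */
		{ 8192,  4096,  1, 0 },				/* too old */
		{ 12288, 65536, 5, 0 },				/* too large */
	};
	struct target t[4];
	int nr = collect(map, 4, 65536, 2, t);

	for (int i = 0; i < nr; i++)
		printf("target: start=%llu len=%llu\n",
		       (unsigned long long)t[i].start,
		       (unsigned long long)t[i].len);
	return 0;
}
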
+ */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (em->len >= extent_thresh) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ } - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - while (1) { - lock_extent_bits(tree, page_start, page_end, - &cached_state); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), - page_start); - unlock_extent_cached(tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * we unlocked the page above, so we need check if - * it was released or not. - */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +add: + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; } + /* Fall through to allocate a new entry */ } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - ret = -EIO; - break; - } + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); } + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; - pages[i] = page; - i_done++; + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); } - if (!i_done || ret) - goto out; + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - goto out; + return ret; +} - /* - * so now we have a nice long stream of locked - * and up to date pages, lets wait on them - */ - for (i = 0; i < i_done; i++) - wait_on_page_writeback(pages[i]); +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + /* Lock the pages range */ + lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* - * When defragmenting we skip ranges that have holes or inline extents, - * (check should_defrag_range()), to avoid unnecessary IO and wasting - * space. At btrfs_defrag_file(), we check if a range should be defragged - * before locking the inode and then, if it should, we trigger a sync - * page cache readahead - we lock the inode only after that to avoid - * blocking for too long other tasks that possibly want to operate on - * other file ranges. 
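
The page-range bookkeeping that defrag_one_range() above performs before locking anything can also be checked in isolation. A sketch of the same arithmetic, assuming 4KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12			/* assume 4KiB pages */
#define CLUSTER_SIZE (256 * 1024)	/* SZ_256K, as in the patch */

int main(void)
{
	uint64_t start = 20480, len = 16384;	/* one defrag target range */
	uint64_t last_index = (start + len - 1) >> PAGE_SHIFT;
	uint64_t start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;

	/* defrag_one_range() asserts the range fits in one cluster. */
	if (nr_pages > CLUSTER_SIZE / (1 << PAGE_SHIFT)) {
		fprintf(stderr, "range larger than a cluster\n");
		return 1;
	}
	printf("pages %llu..%llu (%u pages) must be locked\n",
	       (unsigned long long)start_index,
	       (unsigned long long)last_index, nr_pages);
	return 0;
}
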
But before we were able to get the inode lock, - * some other task may have punched a hole in the range, or we may have - * now an inline extent, in which case we should not defrag. So check - * for that here, where we have the inode and the range locked, and bail - * out if that happened. + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. */ - search_start = page_start; - while (search_start < page_end) { - struct extent_map *em; + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list); + if (ret < 0) + goto unlock_extent; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, - page_end - search_start); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock_range; - } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - /* Ok, 0 means we did not defrag anything */ - ret = 0; - goto out_unlock_range; + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); } - search_start = extent_map_end(em); - free_extent_map(em); } + kfree(pages); + return ret; +} - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; - if (i_done != page_cnt) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, (page_cnt - i_done) << PAGE_SHIFT, true); - } + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list); + if (ret < 0) + goto out; + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; - set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state); + /* Reached the limit */ + if (max_sectors && max_sectors == *sectors_defragged) + break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); - for (i = 0; i < i_done; i++) { - clear_page_dirty_for_io(pages[i]); - ClearPageChecked(pages[i]); - set_page_dirty(pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + 
(entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress); + if (ret < 0) + break; + *sectors_defragged += range_len; } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); - return i_done; - -out_unlock_range: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); } - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); return ret; - } -int btrfs_defrag_file(struct inode *inode, struct file *file, +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NULL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged; if 0, the whole inode + * will be defragged. + */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file_ra_state *ra = NULL; - unsigned long last_index; + unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - u64 newer_off = range->start; - unsigned long i; - unsigned long ra_index = 0; - int ret; - int defrag_count = 0; + u64 cur; + u64 last_byte; + bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; - unsigned long cluster = max_cluster; - u64 new_align = ~((u64)SZ_128K - 1); - struct page **pages = NULL; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; @@ -1444,172 +1516,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (extent_thresh == 0) extent_thresh = SZ_256K; + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len) - 1; + } else { + /* Defrag until file end */ + last_byte = isize - 1; + } + /* - * If we were not given a file, allocate a readahead context. As + * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so * we don't error out. */
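
btrfs_defrag_file() is driven from the BTRFS_IOC_DEFRAG_RANGE ioctl handler shown later in this file. A minimal userspace invocation of the new code path might look like this (file path and threshold are arbitrary example values):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_DEFRAG_RANGE */

int main(void)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd = open("/mnt/btrfs/file", O_RDWR);

	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;		/* whole file, as the handler does */
	range.extent_thresh = 256 * 1024;	/* skip extents >= 256KiB */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;	/* flush when done */
	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0) {
		perror("BTRFS_IOC_DEFRAG_RANGE");
		return 1;
	}
	return 0;
}
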
- if (!file) { + if (!ra) { + ra_allocated = true; + ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (ra) file_ra_state_init(ra, inode->i_mapping); - } else { - ra = &file->f_ra; - } - - pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_ra; } - /* find the last page to defrag */ - if (range->start + range->len > range->start) { - last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_SHIFT; - } else { - last_index = (isize - 1) >> PAGE_SHIFT; - } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - if (newer_than) { - ret = find_new_extents(root, inode, newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - /* - * we always align our defrag to help keep - * the extents in the file evenly spaced - */ - i = (newer_off & new_align) >> PAGE_SHIFT; - } else - goto out_ra; - } else { - i = range->start >> PAGE_SHIFT; - } - if (!max_to_defrag) - max_to_defrag = last_index - i + 1; + while (cur < last_byte) { + u64 cluster_end; - /* - * make writeback starts from i, so the defrag range can be - * written sequentially. - */ - if (i < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = i; + /* The cluster size 256K should always be page aligned */ + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - while (i <= last_index && defrag_count < max_to_defrag && - (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { - /* - * make sure we stop running if someone unmounts - * the FS - */ - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - break; - - if (btrfs_defrag_cancelled(fs_info)) { - btrfs_debug(fs_info, "defrag_file cancelled"); - ret = -EAGAIN; - goto error; - } - - if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, - extent_thresh, &last_len, &skip, - &defrag_end, do_compress)){ - unsigned long next; - /* - * the should_defrag function tells us how much to skip - * bump our counter by the suggested amount - */ - next = DIV_ROUND_UP(skip, PAGE_SIZE); - i = max(i + 1, next); - continue; - } - - if (!newer_than) { - cluster = (PAGE_ALIGN(defrag_end) >> - PAGE_SHIFT) - i; - cluster = min(cluster, max_cluster); - } else { - cluster = max_cluster; - } - - if (i + cluster > ra_index) { - ra_index = max(i, ra_index); - if (ra) - page_cache_sync_readahead(inode->i_mapping, ra, - file, ra_index, cluster); - ra_index += cluster; - } + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; + btrfs_inode_unlock(inode, 0); + break; } - } else { - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); - if (ret < 0) { + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { btrfs_inode_unlock(inode, 0); - goto out_ra; + break; } - - defrag_count += ret; - balance_dirty_pages_ratelimited(inode->i_mapping); + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, + &sectors_defragged, max_to_defrag); btrfs_inode_unlock(inode, 0); - - if (newer_than) { - if (newer_off == (u64)-1) - break; - - if (ret > 0) - i += ret; - - newer_off = max(newer_off + 1, - (u64)i << PAGE_SHIFT); - - ret = find_new_extents(root, inode,
newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - i = (newer_off & new_align) >> PAGE_SHIFT; - } else { - break; - } - } else { - if (ret > 0) { - i += ret; - last_len += ret << PAGE_SHIFT; - } else { - i++; - last_len = 0; - } - } + if (ret < 0) + break; + cur = cluster_end + 1; } - ret = defrag_count; -error: - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) + if (ra_allocated) + kfree(ra); + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. + */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; } - - if (range->compress_type == BTRFS_COMPRESS_LZO) { - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } - -out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } - if (!file) - kfree(ra); - kfree(pages); return ret; } @@ -1658,6 +1645,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; @@ -1713,7 +1701,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -1730,7 +1719,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; + new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; @@ -1771,7 +1760,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_finish; } - if (new_size > device->bdev->bd_inode->i_size) { + if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } @@ -2121,7 +2110,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); - item_len = btrfs_item_size_nr(leaf, i); + item_len = btrfs_item_size(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) @@ -2261,9 +2250,8 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = fault_in_pages_writeable(ubuf + sk_offset, - *buf_size - sk_offset); - if (ret) + ret = -EFAULT; + if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -2576,7 +2564,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); - item_len = 
btrfs_item_size_nr(leaf, slot); + item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { @@ -2778,7 +2766,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); - item_len = btrfs_item_size_nr(leaf, slot) + item_len = btrfs_item_size(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); @@ -3136,12 +3124,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - /* Subpage defrag will be supported in later commits */ - if (root->fs_info->sectorsize < PAGE_SIZE) { - ret = -ENOTTY; - goto out; - } - switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@ -3176,7 +3158,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), file, + ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -3192,13 +3174,25 @@ out: static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; + bool restore_op = false; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { + if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + /* + * We can do the device add because we have a paused balance; + * change the exclusive op type and remember we should bring + * back the paused balance. + */ + fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; + btrfs_exclop_start_unlock(fs_info); + restore_op = true; + } vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { @@ -3214,12 +3208,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - btrfs_exclop_finish(fs_info); + if (restore_op) + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + else + btrfs_exclop_finish(fs_info); return ret; } static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; @@ -3231,35 +3229,37 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto err_drop; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && - strcmp("cancel", vol_args->name) == 0) + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + args.devid = vol_args->devid; + } else if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); +
if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) - goto out; - /* Exclusive operation is now claimed */ + goto err_drop; - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); - else - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + /* Exclusive operation is now claimed */ + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); btrfs_exclop_finish(fs_info); @@ -3271,17 +3271,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) btrfs_info(fs_info, "device deleted: %s", vol_args->name); } -out: - kfree(vol_args); err_drop: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; @@ -3293,32 +3295,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out_drop_write; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - cancel = (strcmp("cancel", vol_args->name) == 0); + if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } - kfree(vol_args); -out_drop_write: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } @@ -3379,22 +3387,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; - char *s_uuid = NULL; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); + args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) - s_uuid = di_args->uuid; + args.uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL); - + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; @@ -3656,7 +3663,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, { struct btrfs_trans_handle *trans; u64 transid; - int ret; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -3668,11 +3674,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + btrfs_commit_transaction_async(trans); out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -4019,6 
+4021,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. ops lock */ int ret; + if (!arg) + btrfs_warn(fs_info, + "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4091,6 +4097,7 @@ locked: spin_lock(&fs_info->balance_lock); bctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); goto do_balance; } @@ -4430,7 +4437,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; - int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4441,9 +4447,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, } if (copy_to_user(arg, &qsa, sizeof(qsa))) - ret = -EFAULT; + return -EFAULT; - return ret; + return 0; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index a2e1f1f5c6e3..bbc45534ae9a 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { - lockdep_assert_held(&eb->lock); +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_write(&eb->lock); } #else -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c25dfd1a8a54..0fb90cbe7669 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -32,19 +32,19 @@ * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. - * One segment represents at most one page of uncompressed data. + * One segment represents at most one sector of uncompressed data. * * 2.1 Segment header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size of the segment (not including the header). - * Segment header never crosses page boundary, thus it's possible to - * have at most 3 padding zeros at the end of the page. + * Segment header never crosses sector boundary, thus it's possible to + * have at most 3 padding zeros at the end of the sector. * * 2.2 Data Payload - * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE) - * which is 4419 for a 4KiB page. + * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) + * which is 4419 for a 4KiB sectorsize. * - * Example: + * Example with 4K sectorsize: * Page 1: * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | @@ -112,163 +112,187 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } +/* + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * + * Will allocate new pages when needed. 
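
The sector-aware layout that copy_compressed_data_to_page() below produces (and that the format comment at the top of this file describes) can be demonstrated without the kernel: write a 4-byte little-endian length header, then the payload, and pad with zeros whenever fewer than 4 bytes remain in the sector. A userspace sketch with a 4KiB sector; buffer sizes and the payload length are invented for the demo:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LZO_LEN    4
#define SECTORSIZE 4096

/* Append one segment (header + payload) at *cur, padding the sector
 * tail when it cannot hold another 4-byte header - the same rule the
 * kernel applies per sector. */
static void put_segment(uint8_t *buf, uint32_t *cur,
			const uint8_t *data, uint32_t len)
{
	uint32_t tail;

	/* 4-byte little-endian segment header holding the payload size. */
	buf[*cur + 0] = len & 0xff;
	buf[*cur + 1] = (len >> 8) & 0xff;
	buf[*cur + 2] = (len >> 16) & 0xff;
	buf[*cur + 3] = (len >> 24) & 0xff;
	*cur += LZO_LEN;

	memcpy(buf + *cur, data, len);
	*cur += len;

	tail = SECTORSIZE - (*cur % SECTORSIZE);
	if (tail < LZO_LEN && tail != SECTORSIZE) {
		memset(buf + *cur, 0, tail);	/* header must not cross */
		*cur += tail;
	}
}

int main(void)
{
	static uint8_t out[2 * SECTORSIZE];
	static const uint8_t payload[4085];	/* leaves 3 bytes in sector */
	uint32_t cur = LZO_LEN;			/* skip the total-size header */

	put_segment(out, &cur, payload, sizeof(payload));
	printf("next segment starts at %u (sector-aligned: %s)\n",
	       cur, cur % SECTORSIZE == 0 ? "yes" : "no");
	return 0;
}
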
+ */ +static int copy_compressed_data_to_page(char *compressed_data, + size_t compressed_size, + struct page **out_pages, + unsigned long max_nr_page, + u32 *cur_out, + const u32 sectorsize) +{ + u32 sector_bytes_left; + u32 orig_out; + struct page *cur_page; + char *kaddr; + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. + */ + ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + + kaddr = kmap(cur_page); + write_compress_length(kaddr + offset_in_page(*cur_out), + compressed_size); + *cur_out += LZO_LEN; + + orig_out = *cur_out; + + /* Copy compressed data */ + while (*cur_out - orig_out < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, + orig_out + compressed_size - *cur_out); + + kunmap(cur_page); + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + kaddr = kmap(cur_page); + + memcpy(kaddr + offset_in_page(*cur_out), + compressed_data + *cur_out - orig_out, copy_len); + + *cur_out += copy_len; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) + goto out; + + /* The remaining size is not enough, pad it with zeros */ + memset(kaddr + offset_in_page(*cur_out), 0, + sector_bytes_left); + *cur_out += sector_bytes_left; + +out: + kunmap(cur_page); + return 0; +} + int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + struct page *page_in = NULL; + char *sizes_ptr; + const unsigned long max_nr_page = *out_pages; int ret = 0; - char *data_in; - char *cpage_out, *sizes_ptr; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; - size_t in_len; - size_t out_len; - char *buf; - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long pg_bytes_left; - unsigned long out_offset; - unsigned long bytes; + /* Points to the file offset of input data */ + u64 cur_in = start; + /* Points to the current output byte */ + u32 cur_out = 0; + u32 len = *total_out; + ASSERT(max_nr_page > 0); *out_pages = 0; *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - /* - * store the size of all chunks of compressed data in - * the first 4 bytes + * Skip the header for now, we will later come back and write the total + * compressed size */ - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = 
page_address(out_page); - out_offset = LZO_LEN; - tot_out = LZO_LEN; - pages[0] = out_page; - nr_pages = 1; - pg_bytes_left = PAGE_SIZE - LZO_LEN; - - /* compress at most one page of data each time */ - in_len = min(len, PAGE_SIZE); - while (tot_in < len) { - ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, - &out_len, workspace->mem); - if (ret != LZO_E_OK) { - pr_debug("BTRFS: lzo in loop returned %d\n", - ret); + cur_out += LZO_LEN; + while (cur_in < start + len) { + char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first */ + if (!page_in) { + page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); + ASSERT(page_in); + } + + /* Compress at most one sector of data each time */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); + data_in = kmap(page_in); + ret = lzo1x_1_compress(data_in + + offset_in_page(cur_in), in_len, + workspace->cbuf, &out_len, + workspace->mem); + kunmap(page_in); + if (ret < 0) { + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; } - /* store the size of this chunk of compressed data */ - write_compress_length(cpage_out + out_offset, out_len); - tot_out += LZO_LEN; - out_offset += LZO_LEN; - pg_bytes_left -= LZO_LEN; - - tot_in += in_len; - tot_out += out_len; - - /* copy bytes from the working buffer into the pages */ - buf = workspace->cbuf; - while (out_len) { - bytes = min_t(unsigned long, pg_bytes_left, out_len); - - memcpy(cpage_out + out_offset, buf, bytes); - - out_len -= bytes; - pg_bytes_left -= bytes; - buf += bytes; - out_offset += bytes; - - /* - * we need another page for writing out. - * - * Note if there's less than 4 bytes left, we just - * skip to a new page. - */ - if ((out_len == 0 && pg_bytes_left < LZO_LEN) || - pg_bytes_left == 0) { - if (pg_bytes_left) { - memset(cpage_out + out_offset, 0, - pg_bytes_left); - tot_out += pg_bytes_left; - } - - /* we're done, don't allocate new page */ - if (out_len == 0 && tot_in >= len) - break; - - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -E2BIG; - goto out; - } - - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = page_address(out_page); - pages[nr_pages++] = out_page; - - pg_bytes_left = PAGE_SIZE; - out_offset = 0; - } - } + ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + pages, max_nr_page, + &cur_out, sectorsize); + if (ret < 0) + goto out; - /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) { + cur_in += in_len; + + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. 
+ */ + if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { ret = -E2BIG; goto out; } - /* we're all done */ - if (tot_in >= len) - break; - - if (tot_out > max_out) - break; - - bytes_left = len - tot_in; - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - in_len = min(bytes_left, PAGE_SIZE); - } - - if (tot_out >= tot_in) { - ret = -E2BIG; - goto out; + /* Check if we have reached page boundary */ + if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + put_page(page_in); + page_in = NULL; + } } - /* store the size of all chunks of compressed data */ - sizes_ptr = page_address(pages[0]); - write_compress_length(sizes_ptr, tot_out); + /* Store the size of all chunks of compressed data */ + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, cur_out); + kunmap_local(sizes_ptr); ret = 0; - *total_out = tot_out; - *total_in = tot_in; + *total_out = cur_out; + *total_in = cur_in - start; out: - *out_pages = nr_pages; - - if (in_page) - put_page(in_page); - + if (page_in) + put_page(page_in); + *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -283,6 +307,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { + char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -290,9 +315,11 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); memcpy(dest + *cur_in - orig_in, - page_address(cur_page) + offset_in_page(*cur_in), + kaddr + offset_in_page(*cur_in), copy_len); + kunmap(cur_page); *cur_in += copy_len; } @@ -303,6 +330,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; @@ -311,7 +339,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - len_in = read_compress_length(page_address(cb->compressed_pages[0])); + kaddr = kmap(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap(cb->compressed_pages[0]); cur_in += LZO_LEN; /* @@ -345,8 +375,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - seg_len = read_compress_length(page_address(cur_page) + - offset_in_page(cur_in)); + kaddr = kmap(cur_page); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + kunmap(cur_page); cur_in += LZO_LEN; /* Copy the compressed segment payload into workspace */ @@ -431,7 +462,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = page_address(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -441,6 +472,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c 
index aae1027bd76a..0775ae9f4419 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u64 flags; u64 offset; int ref_index = 0; @@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_fs_info *fs_info; int i; u32 type, nr; - struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; @@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); type = key.type; pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", i, key.objectid, type, key.offset, - btrfs_item_offset(l, item), btrfs_item_size(l, item)); + btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); @@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_UUID_KEY_SUBVOL: case BTRFS_UUID_KEY_RECEIVED_SUBVOL: print_uuid_item(l, btrfs_item_ptr_offset(l, i), - btrfs_item_size_nr(l, i)); + btrfs_item_size(l, i)); break; } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b1cb5a8c2999..1a6d2d5b4b33 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root, di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); cur = 0; - total_len = btrfs_item_size_nr(leaf, slot); + total_len = btrfs_item_size(leaf, slot); while (cur < total_len) { u32 name_len = btrfs_dir_name_len(leaf, di); @@ -377,8 +377,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, */ if (need_reserve) { num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - num_bytes, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_block_rsv_add(fs_info, trans->block_rsv, + num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (ret) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db680f5be745..8928275823a1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) int ret = 0; int slot; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to enable quotas, because we will unlock + * and relock qgroup_ioctl_lock before setting fs_info->quota_root + * and before setting BTRFS_FS_QUOTA_ENABLED. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1117,8 +1125,19 @@ out_add_root: goto out_free_path; } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* + * Commit the transaction while not holding qgroup_ioctl_lock, to avoid + * a deadlock with tasks concurrently doing other qgroup operations, such + * as adding/removing qgroups or adding/deleting qgroup relations for example, + * because all qgroup operations first start or join a transaction and then + * lock the qgroup_ioctl_lock mutex. + * We are safe from a concurrent task trying to enable quotas by calling + * this function, since we are serialized by fs_info->subvol_sem.
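
The unlock/commit/relock dance added above is a general lock-ordering fix: never hold a mutex across a blocking operation that other holders of the same mutex may be waiting behind. A self-contained pthread illustration of the pattern (all names invented for the sketch):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ioctl_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for btrfs_commit_transaction(): may have to wait for other
 * tasks, which in turn may be blocked on ioctl_lock. */
static void commit_transaction(void)
{
	puts("committing without holding ioctl_lock");
}

static void quota_enable(void)
{
	pthread_mutex_lock(&ioctl_lock);
	/* ... build the quota tree under the lock ... */

	/*
	 * Drop the lock before the blocking commit, exactly as the qgroup
	 * patch does, then retake it to finish publishing the new state.
	 */
	pthread_mutex_unlock(&ioctl_lock);
	commit_transaction();
	pthread_mutex_lock(&ioctl_lock);

	/* ... set the quota root and the ENABLED bit ... */
	pthread_mutex_unlock(&ioctl_lock);
}

int main(void)
{
	quota_enable();
	return 0;
}
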
+ */ ret = btrfs_commit_transaction(trans); trans = NULL; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path; @@ -1219,7 +1238,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_tree_lock(quota_root->node); btrfs_clean_tree_block(quota_root->node); btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); btrfs_put_root(quota_root); @@ -3141,6 +3161,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; @@ -3150,7 +3171,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int ret; mutex_lock(&fs_info->qgroup_rescan_lock); - ret = btrfs_search_slot_for_read(fs_info->extent_root, + extent_root = btrfs_extent_root(fs_info, + fs_info->qgroup_rescan_progress.objectid); + ret = btrfs_search_slot_for_read(extent_root, &fs_info->qgroup_rescan_progress, path, 1, 0); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8d268ca8aa7..0e239a4c3b26 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -60,8 +60,7 @@ enum btrfs_rbio_ops { }; struct btrfs_raid_bio { - struct btrfs_fs_info *fs_info; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; /* while we're doing rmw on a stripe * we put it into a hash table so we can @@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work); static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); + btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bbio->raid_map[0]; + u64 num = rbio->bioc->raid_map[0]; /* * we shift down quite a bit. 
We're using byte @@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be @@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); __remove_rbio_from_cache(rbio); @@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); spin_lock(&rbio->bio_list_lock); @@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bbio->raid_map[0] != - cur->bbio->raid_map[0]) + if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; - h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) continue; spin_lock(&cur->bio_list_lock); @@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) int keep_cache = 0; bucket = rbio_bucket(rbio); - h = rbio->fs_info->stripe_hash_table->table + bucket; + h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); @@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - btrfs_put_bbio(rbio->bbio); + btrfs_put_bioc(rbio->bioc); kfree(rbio); } @@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *extra; if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio) /* OK, we have read all the stripes we need to. */ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bbio->max_errors; + 0 : rbio->bioc->max_errors; if (atomic_read(&rbio->error) > max_errors) err = BLK_STS_IOERR; @@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. 
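
alloc_rbio() below derives its stripe geometry from the io_context with two subtractions: target devices of a running replace are not real stripes, and RAID5/RAID6 reserve one or two parity stripes respectively. A sketch of the same arithmetic, with the block-group flags reduced to a plain enum for the demo:

#include <stdio.h>

enum map_type { RAID5, RAID6 };

/* Mirror of the alloc_rbio() arithmetic described above. */
static void rbio_geometry(enum map_type type, int num_stripes,
			  int num_tgtdevs)
{
	int real_stripes = num_stripes - num_tgtdevs;
	int nr_data = (type == RAID5) ? real_stripes - 1 : real_stripes - 2;

	printf("real_stripes=%d nr_data=%d\n", real_stripes, nr_data);
}

int main(void)
{
	rbio_geometry(RAID5, 4, 0);	/* 3 data + 1 parity */
	rbio_geometry(RAID6, 6, 1);	/* replace running: 5 real, 3 data */
	return 0;
}
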
*/ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; int num_pages = rbio_nr_pages(stripe_len, real_stripes); int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; @@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); - rbio->bbio = bbio; - rbio->fs_info = fs_info; + rbio->bioc = bioc; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->real_stripes = real_stripes; @@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); #undef CONSUME_ALLOC - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) nr_data = real_stripes - 1; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else BUG(); @@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, struct bio *last = bio_list->tail; int ret; struct bio *bio; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; u64 disk_start; - stripe = &rbio->bbio->stripes[stripe_nr]; + stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + (page_index << PAGE_SHIFT); /* if the device is missing, just fail this stripe */ @@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_io_bio(bio)->device = stripe->dev; + bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; @@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) int i = 0; start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bbio->raid_map[0]; + stripe_offset = start - rbio->bioc->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; + bio->bi_iter = btrfs_bio(bio)->iter; bio_for_each_segment(bvec, bio, iter) { rbio->bio_pages[page_index + i] = bvec.bv_page; @@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) */ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; @@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } } - if (likely(!bbio->num_tgtdevs)) + if (likely(!bioc->num_tgtdevs)) goto write_data; for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bbio->tgtdev_map[stripe]) + if (!bioc->tgtdev_map[stripe]) continue; for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { @@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bbio->tgtdev_map[stripe], + rbio->bioc->tgtdev_map[stripe], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -1339,12 +1336,12 @@ static int 
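finish_rmw() maps one page from every stripe into a pointer array and computes parity page by page. A minimal sketch of the RAID5 case, with illustrative names (the RAID6 path hands the same pointer array to the lib/raid6 syndrome generator instead):

	/*
	 * XOR the nr_data mapped data pages into the P page.
	 * pointers[0..nr_data-1] are data, pointers[nr_data] is parity.
	 */
	static void gen_raid5_parity(void **pointers, int nr_data, size_t len)
	{
		u64 *p = pointers[nr_data];
		size_t i;
		int stripe;

		memcpy(p, pointers[0], len);
		for (stripe = 1; stripe < nr_data; stripe++) {
			const u64 *src = pointers[stripe];

			for (i = 0; i < len / sizeof(u64); i++)
				p[i] ^= src[i];
		}
	}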
find_bio_stripe(struct btrfs_raid_bio *rbio, { u64 physical = bio->bi_iter.bi_sector; int i; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; physical <<= 9; - for (i = 0; i < rbio->bbio->num_stripes; i++) { - stripe = &rbio->bbio->stripes[i]; + for (i = 0; i < rbio->bioc->num_stripes; i++) { + stripe = &rbio->bioc->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; @@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, int i; for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bbio->raid_map[i]; + u64 stripe_start = rbio->bioc->raid_map[i]; if (in_range(logical, stripe_start, rbio->stripe_len)) return i; @@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; /* @@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes from the rest of the FS. 
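find_bio_stripe() and find_logical_bio_stripe() are the two directions of one mapping between addresses and stripe numbers. Reduced to its essentials, the logical-to-stripe lookup is a bounds check against raid_map (a sketch with in_range() expanded):

	/* Return the data stripe index covering @logical, or -1 if none. */
	static int logical_to_stripe_nr(const u64 *raid_map, int nr_data,
					u64 stripe_len, u64 logical)
	{
		int i;

		for (i = 0; i < nr_data; i++) {
			if (logical >= raid_map[i] &&
			    logical - raid_map[i] < stripe_len)
				return i;
		}
		return -1;
	}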
*/ -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bbio->raid_map[faila] == + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == RAID5_P_STRIPE) { err = BLK_STS_IOERR; goto cleanup; @@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); @@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * were up to date, or we might have no bios to read because * the devices were gone. */ - if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { + if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { __raid_recover_end_io(rbio); return 0; } else { @@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2116,22 +2114,22 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
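The two-failure branches above depend on the parity stripes having been sorted to the end of raid_map, so failb (the higher failed index) determines which recovery routine applies. Condensed, with rebuild_from_p() standing in for the XOR path the file labels pstripe:

	if (raid_map[failb] == RAID6_Q_STRIPE) {
		/*
		 * Both failures are parity.  The data on disk is intact but
		 * already failed its checksum, so there is nothing left to
		 * rebuild from.
		 */
		if (raid_map[faila] == RAID5_P_STRIPE)
			return BLK_STS_IOERR;
		rebuild_from_p();	/* one data stripe + Q: RAID5-style XOR */
	} else if (raid_map[failb] == RAID5_P_STRIPE) {
		/* one data stripe + P: recover the data through Q */
		raid6_datap_recov(real_stripes, PAGE_SIZE, faila, pointers);
	} else {
		/* two data stripes: full recovery from P and Q */
		raid6_2data_recov(real_stripes, PAGE_SIZE, faila, failb,
				  pointers);
	}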
*/ -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io) +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int ret; if (generic_io) { - ASSERT(bbio->mirror_num == mirror_num); - btrfs_io_bio(bio)->mirror_num = mirror_num; + ASSERT(bioc->mirror_num == mirror_num); + btrfs_bio(bio)->mirror_num = mirror_num; } - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } @@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, - "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", +"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bbio->map_type); + (u64)bio->bi_iter.bi_size, bioc->map_type); if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); kfree(rbio); return -EIO; } @@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; } else { - btrfs_get_bbio(bbio); + btrfs_get_bioc(bioc); } /* @@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * - * Caller must have already increased bio_counter for getting @bbio. + * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors) +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* - * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. 
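Because btrfs_io_context now carries fs_info, the raid56 entry points shed their explicit fs_info parameter. A hypothetical call-site migration (variable names assumed):

	/* before */
	ret = raid56_parity_recover(fs_info, bio, bbio, map_length,
				    mirror_num, 1);

	/* after: fs_info is reached through bioc->fs_info internally */
	ret = raid56_parity_recover(bio, bioc, map_length, mirror_num, 1);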
*/ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { - if (bbio->stripes[i].dev == scrub_dev) { + if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } @@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); /* - * We have already increased bio_counter when getting bbio, record it + * We have already increased bio_counter when getting bioc, record it * so we can free it at rbio_orig_end_io(). */ rbio->generic_bio_cnt = 1; @@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, int stripe_offset; int index; - ASSERT(logical >= rbio->bbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + + ASSERT(logical >= rbio->bioc->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset >> PAGE_SHIFT; rbio->bio_pages[index] = page; } @@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; @@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, else BUG(); - if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); } @@ -2435,7 +2433,7 @@ writeback: page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); ret = rbio_add_io_page(rbio, &bio_list, page, - bbio->tgtdev_map[rbio->scrubp], + bioc->tgtdev_map[rbio->scrubp], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) */ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; if (rbio->faila >= 0 || rbio->failb >= 0) { @@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) * the data, so the capability of the repair is declined. * (In the case of RAID5, we can not repair anything) */ - if (dfail > rbio->bbio->max_errors - 1) + if (dfail > rbio->bioc->max_errors - 1) goto cleanup; /* @@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. 
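finish_parity_scrub() recomputes parity into scratch pages and only queues writeback for pages whose on-disk parity actually differs; in outline (reconstructed from the unchanged parts of the function, so treat the details as a sketch):

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		void *parity = kmap(p);

		/* pointers[rbio->scrubp] holds the freshly computed parity */
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
			copy_page(parity, pointers[rbio->scrubp]);
		else
			/* on-disk parity already correct, skip the write */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);
	}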
*/ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bbio, length); + rbio = alloc_rbio(fs_info, bioc, length); if (IS_ERR(rbio)) return NULL; @@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, } /* - * When we get bbio, we have already increased bio_counter, record it + * When we get bioc, we have already increased bio_counter, record it * so we can free it at rbio_orig_end_io() */ rbio->generic_bio_cnt = 1; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2503485db859..72c00fc284b5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io); -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len); +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors); +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c deleted file mode 100644 index 06713a8fe26b..000000000000 --- a/fs/btrfs/reada.c +++ /dev/null @@ -1,1086 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2011 STRATO. All rights reserved. 
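These header changes are the visible edge of the series-wide rename from struct btrfs_bio to struct btrfs_io_context. A simplified sketch of the container as raid56.c uses it (field order and the full definition in volumes.h differ; fs_info is the newly added member that makes the dropped parameters possible):

	struct btrfs_io_context {
		refcount_t refs;		/* btrfs_get_bioc()/btrfs_put_bioc() */
		struct btrfs_fs_info *fs_info;	/* new back pointer */
		u64 map_type;			/* BTRFS_BLOCK_GROUP_RAID5/6, ... */
		int mirror_num;
		int max_errors;
		int num_stripes;
		int num_tgtdevs;		/* dev-replace target count */
		int *tgtdev_map;		/* per-stripe replace target */
		u64 *raid_map;			/* logical start of each stripe */
		struct btrfs_io_stripe stripes[]; /* device + physical offset */
	};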
- */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include "ctree.h" -#include "volumes.h" -#include "disk-io.h" -#include "transaction.h" -#include "dev-replace.h" -#include "block-group.h" - -#undef DEBUG - -/* - * This is the implementation for the generic read ahead framework. - * - * To trigger a readahead, btrfs_reada_add must be called. It will start - * a read ahead for the given range [start, end) on tree root. The returned - * handle can either be used to wait on the readahead to finish - * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). - * - * The read ahead works as follows: - * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. - * reada_start_machine will then search for extents to prefetch and trigger - * some reads. When a read finishes for a node, all contained node/leaf - * pointers that lie in the given range will also be enqueued. The reads will - * be triggered in sequential order, thus giving a big win over a naive - * enumeration. It will also make use of multi-device layouts. Each disk - * will have its own read pointer and all disks will be utilized in parallel. - * No two disks will read both sides of a mirror simultaneously, as this - * would waste seeking capacity. Instead both disks will read different parts - * of the filesystem. - * Any number of readaheads can be started in parallel. The read order will be - * determined globally, i.e. 2 parallel readaheads will normally finish faster - * than the 2 started one after another. - */ - -#define MAX_IN_FLIGHT 6 - -struct reada_extctl { - struct list_head list; - struct reada_control *rc; - u64 generation; -}; - -struct reada_extent { - u64 logical; - u64 owner_root; - struct btrfs_key top; - struct list_head extctl; - int refcnt; - spinlock_t lock; - struct reada_zone *zones[BTRFS_MAX_MIRRORS]; - int nzones; - int scheduled; - int level; -}; - -struct reada_zone { - u64 start; - u64 end; - u64 elems; - struct list_head list; - spinlock_t lock; - int locked; - struct btrfs_device *device; - struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl - * self */ - int ndevs; - struct kref refcnt; -}; - -struct reada_machine_work { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); -static void reada_control_release(struct kref *kref); -static void reada_zone_release(struct kref *kref); -static void reada_start_machine(struct btrfs_fs_info *fs_info); -static void __reada_start_machine(struct btrfs_fs_info *fs_info); - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level); - -/* recurses */ -/* in case of err, eb might be NULL */ -static void __readahead_hook(struct btrfs_fs_info *fs_info, - struct reada_extent *re, struct extent_buffer *eb, - int err) -{ - int nritems; - int i; - u64 bytenr; - u64 generation; - struct list_head list; - - spin_lock(&re->lock); - /* - * just take the full list from the extent. afterwards we - * don't need the lock anymore - */ - list_replace_init(&re->extctl, &list); - re->scheduled = 0; - spin_unlock(&re->lock); - - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. 
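The deleted framework tracked its state in radix trees keyed by logical address in sectorsize units: one tree of reada_extents per filesystem plus, per device, a tree of zones and a tree of the extents touching that device. The lookup convention, distilled from the code below:

	unsigned long index = logical >> fs_info->sectorsize_bits;
	struct reada_extent *re;

	spin_lock(&fs_info->reada_lock);
	re = radix_tree_lookup(&fs_info->reada_tree, index);
	if (re)
		re->refcnt++;	/* plain int, protected by reada_lock */
	spin_unlock(&fs_info->reada_lock);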
Effectively this will - * cut the branch below this node from read ahead. - */ - if (err) - goto cleanup; - - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - if (!btrfs_header_level(eb)) - goto cleanup; - - nritems = btrfs_header_nritems(eb); - generation = btrfs_header_generation(eb); - for (i = 0; i < nritems; i++) { - struct reada_extctl *rec; - u64 n_gen; - struct btrfs_key key; - struct btrfs_key next_key; - - btrfs_node_key_to_cpu(eb, &key, i); - if (i + 1 < nritems) - btrfs_node_key_to_cpu(eb, &next_key, i + 1); - else - next_key = re->top; - bytenr = btrfs_node_blockptr(eb, i); - n_gen = btrfs_node_ptr_generation(eb, i); - - list_for_each_entry(rec, &list, list) { - struct reada_control *rc = rec->rc; - - /* - * if the generation doesn't match, just ignore this - * extctl. This will probably cut off a branch from - * prefetch. Alternatively one could start a new (sub-) - * prefetch for this branch, starting again from root. - * FIXME: move the generation check out of this loop - */ -#ifdef DEBUG - if (rec->generation != generation) { - btrfs_debug(fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); - } -#endif - if (rec->generation == generation && - btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && - btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - btrfs_header_owner(eb), n_gen, - btrfs_header_level(eb) - 1); - } - } - -cleanup: - /* - * free extctl records - */ - while (!list_empty(&list)) { - struct reada_control *rc; - struct reada_extctl *rec; - - rec = list_first_entry(&list, struct reada_extctl, list); - list_del(&rec->list); - rc = rec->rc; - kfree(rec); - - kref_get(&rc->refcnt); - if (atomic_dec_and_test(&rc->elems)) { - kref_put(&rc->refcnt, reada_control_release); - wake_up(&rc->wait); - } - kref_put(&rc->refcnt, reada_control_release); - - reada_extent_put(fs_info, re); /* one ref for each entry */ - } - - return; -} - -int btree_readahead_hook(struct extent_buffer *eb, int err) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int ret = 0; - struct reada_extent *re; - - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, - eb->start >> fs_info->sectorsize_bits); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - if (!re) { - ret = -1; - goto start_machine; - } - - __readahead_hook(fs_info, re, eb, err); - reada_extent_put(fs_info, re); /* our ref */ - -start_machine: - reada_start_machine(fs_info); - return ret; -} - -static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, - struct btrfs_bio *bbio) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; - struct reada_zone *zone; - struct btrfs_block_group *cache = NULL; - u64 start; - u64 end; - int i; - - zone = NULL; - spin_lock(&fs_info->reada_lock); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) { - kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - return zone; - } - - spin_unlock(&fs_info->reada_lock); - - cache = btrfs_lookup_block_group(fs_info, logical); - if (!cache) - return NULL; - - start = cache->start; - end = start + cache->length - 1; - 
btrfs_put_block_group(cache); - - zone = kzalloc(sizeof(*zone), GFP_KERNEL); - if (!zone) - return NULL; - - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - kfree(zone); - return NULL; - } - - zone->start = start; - zone->end = end; - INIT_LIST_HEAD(&zone->list); - spin_lock_init(&zone->lock); - zone->locked = 0; - kref_init(&zone->refcnt); - zone->elems = 0; - zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < bbio->num_stripes; ++i) { - /* bounds have already been checked */ - zone->devs[i] = bbio->stripes[i].dev; - } - zone->ndevs = bbio->num_stripes; - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> fs_info->sectorsize_bits), - zone); - - if (ret == -EEXIST) { - kfree(zone); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> fs_info->sectorsize_bits, 1); - if (ret == 1 && logical >= zone->start && logical <= zone->end) - kref_get(&zone->refcnt); - else - zone = NULL; - } - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - - return zone; -} - -static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, - u64 logical, - struct btrfs_key *top, - u64 owner_root, int level) -{ - int ret; - struct reada_extent *re = NULL; - struct reada_extent *re_exist = NULL; - struct btrfs_bio *bbio = NULL; - struct btrfs_device *dev; - struct btrfs_device *prev_dev; - u64 length; - int real_stripes; - int nzones = 0; - unsigned long index = logical >> fs_info->sectorsize_bits; - int dev_replace_is_ongoing; - int have_zone = 0; - - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (re) - return re; - - re = kzalloc(sizeof(*re), GFP_KERNEL); - if (!re) - return NULL; - - re->logical = logical; - re->top = *top; - INIT_LIST_HEAD(&re->extctl); - spin_lock_init(&re->lock); - re->refcnt = 1; - re->owner_root = owner_root; - re->level = level; - - /* - * map block - */ - length = fs_info->nodesize; - ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0); - if (ret || !bbio || length < fs_info->nodesize) - goto error; - - if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - btrfs_err(fs_info, - "readahead: more than %d copies not supported", - BTRFS_MAX_MIRRORS); - goto error; - } - - real_stripes = bbio->num_stripes - bbio->num_tgtdevs; - for (nzones = 0; nzones < real_stripes; ++nzones) { - struct reada_zone *zone; - - dev = bbio->stripes[nzones].dev; - - /* cannot read ahead on missing device. 
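reada_find_zone() ends with an insert-or-reuse idiom worth noting: preload the radix tree outside the spinlock, insert under it, and on -EEXIST drop the local allocation and take a reference on whichever zone won the race (the real code also re-checks that the found zone covers the logical address). Condensed:

	if (radix_tree_preload(GFP_KERNEL)) {
		kfree(zone);
		return NULL;
	}

	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&dev->reada_zones, index, zone);
	if (ret == -EEXIST) {
		kfree(zone);	/* lost the race, reuse the winner */
		ret = radix_tree_gang_lookup(&dev->reada_zones,
					     (void **)&zone, index, 1);
		if (ret == 1)
			kref_get(&zone->refcnt);
		else
			zone = NULL;
	}
	spin_unlock(&fs_info->reada_lock);
	radix_tree_preload_end();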
*/ - if (!dev->bdev) - continue; - - zone = reada_find_zone(dev, logical, bbio); - if (!zone) - continue; - - re->zones[re->nzones++] = zone; - spin_lock(&zone->lock); - if (!zone->elems) - kref_get(&zone->refcnt); - ++zone->elems; - spin_unlock(&zone->lock); - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - if (re->nzones == 0) { - /* not a single zone found, error and out */ - goto error; - } - - /* Insert extent in reada tree + all per-device trees, all or nothing */ - down_read(&fs_info->dev_replace.rwsem); - ret = radix_tree_preload(GFP_KERNEL); - if (ret) { - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&fs_info->reada_tree, index, re); - if (ret == -EEXIST) { - re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - re_exist->refcnt++; - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - if (ret) { - spin_unlock(&fs_info->reada_lock); - radix_tree_preload_end(); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - radix_tree_preload_end(); - prev_dev = NULL; - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( - &fs_info->dev_replace); - for (nzones = 0; nzones < re->nzones; ++nzones) { - dev = re->zones[nzones]->device; - - if (dev == prev_dev) { - /* - * in case of DUP, just add the first zone. As both - * are on the same device, there's nothing to gain - * from adding both. - * Also, it wouldn't work, as the tree is per device - * and adding would fail with EEXIST - */ - continue; - } - if (!dev->bdev) - continue; - - if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) - continue; - - if (dev_replace_is_ongoing && - dev == fs_info->dev_replace.tgtdev) { - /* - * as this device is selected for reading only as - * a last resort, skip it for read ahead. 
- */ - continue; - } - prev_dev = dev; - ret = radix_tree_insert(&dev->reada_extents, index, re); - if (ret) { - while (--nzones >= 0) { - dev = re->zones[nzones]->device; - BUG_ON(dev == NULL); - /* ignore whether the entry was inserted */ - radix_tree_delete(&dev->reada_extents, index); - } - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - goto error; - } - have_zone = 1; - } - if (!have_zone) - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - up_read(&fs_info->dev_replace.rwsem); - - if (!have_zone) - goto error; - - btrfs_put_bbio(bbio); - return re; - -error: - for (nzones = 0; nzones < re->nzones; ++nzones) { - struct reada_zone *zone; - - zone = re->zones[nzones]; - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* - * no fs_info->reada_lock needed, as this can't be - * the last ref - */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - btrfs_put_bbio(bbio); - kfree(re); - return re_exist; -} - -static void reada_extent_put(struct btrfs_fs_info *fs_info, - struct reada_extent *re) -{ - int i; - unsigned long index = re->logical >> fs_info->sectorsize_bits; - - spin_lock(&fs_info->reada_lock); - if (--re->refcnt) { - spin_unlock(&fs_info->reada_lock); - return; - } - - radix_tree_delete(&fs_info->reada_tree, index); - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - radix_tree_delete(&zone->device->reada_extents, index); - } - - spin_unlock(&fs_info->reada_lock); - - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* no fs_info->reada_lock needed, as this can't be - * the last ref */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - - kfree(re); -} - -static void reada_zone_release(struct kref *kref) -{ - struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); - struct btrfs_fs_info *fs_info = zone->device->fs_info; - - lockdep_assert_held(&fs_info->reada_lock); - - radix_tree_delete(&zone->device->reada_zones, - zone->end >> fs_info->sectorsize_bits); - - kfree(zone); -} - -static void reada_control_release(struct kref *kref) -{ - struct reada_control *rc = container_of(kref, struct reada_control, - refcnt); - - kfree(rc); -} - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 owner_root, - u64 generation, int level) -{ - struct btrfs_fs_info *fs_info = rc->fs_info; - struct reada_extent *re; - struct reada_extctl *rec; - - /* takes one ref */ - re = reada_find_extent(fs_info, logical, top, owner_root, level); - if (!re) - return -1; - - rec = kzalloc(sizeof(*rec), GFP_KERNEL); - if (!rec) { - reada_extent_put(fs_info, re); - return -ENOMEM; - } - - rec->rc = rc; - rec->generation = generation; - atomic_inc(&rc->elems); - - spin_lock(&re->lock); - list_add_tail(&rec->list, &re->extctl); - spin_unlock(&re->lock); - - /* leave the ref on the extent */ - - return 0; -} - -/* - * called with fs_info->reada_lock held - */ -static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) -{ - 
int i; - unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; - - for (i = 0; i < zone->ndevs; ++i) { - struct reada_zone *peer; - peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); - if (peer && peer->device != zone->device) - peer->locked = lock; - } -} - -/* - * called with fs_info->reada_lock held - */ -static int reada_pick_zone(struct btrfs_device *dev) -{ - struct reada_zone *top_zone = NULL; - struct reada_zone *top_locked_zone = NULL; - u64 top_elems = 0; - u64 top_locked_elems = 0; - unsigned long index = 0; - int ret; - - if (dev->reada_curr_zone) { - reada_peer_zones_set_lock(dev->reada_curr_zone, 0); - kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); - dev->reada_curr_zone = NULL; - } - /* pick the zone with the most elements */ - while (1) { - struct reada_zone *zone; - - ret = radix_tree_gang_lookup(&dev->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; - if (zone->locked) { - if (zone->elems > top_locked_elems) { - top_locked_elems = zone->elems; - top_locked_zone = zone; - } - } else { - if (zone->elems > top_elems) { - top_elems = zone->elems; - top_zone = zone; - } - } - } - if (top_zone) - dev->reada_curr_zone = top_zone; - else if (top_locked_zone) - dev->reada_curr_zone = top_locked_zone; - else - return 0; - - dev->reada_next = dev->reada_curr_zone->start; - kref_get(&dev->reada_curr_zone->refcnt); - reada_peer_zones_set_lock(dev->reada_curr_zone, 1); - - return 1; -} - -static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, int level, int mirror_num, - struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); - if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); - if (ret) { - free_extent_buffer_stale(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer_stale(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - -static int reada_start_machine_dev(struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - struct reada_extent *re = NULL; - int mirror_num = 0; - struct extent_buffer *eb = NULL; - u64 logical; - int ret; - int i; - - spin_lock(&fs_info->reada_lock); - if (dev->reada_curr_zone == NULL) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - } - /* - * FIXME currently we issue the reads one extent at a time. 
If we have - * a contiguous block of extents, we could also coagulate them or use - * plugging to speed things up - */ - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - if (ret == 0 || re->logical > dev->reada_curr_zone->end) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - re = NULL; - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> fs_info->sectorsize_bits, 1); - } - if (ret == 0) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - dev->reada_next = re->logical + fs_info->nodesize; - re->refcnt++; - - spin_unlock(&fs_info->reada_lock); - - spin_lock(&re->lock); - if (re->scheduled || list_empty(&re->extctl)) { - spin_unlock(&re->lock); - reada_extent_put(fs_info, re); - return 0; - } - re->scheduled = 1; - spin_unlock(&re->lock); - - /* - * find mirror num - */ - for (i = 0; i < re->nzones; ++i) { - if (re->zones[i]->device == dev) { - mirror_num = i + 1; - break; - } - } - logical = re->logical; - - atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, - re->level, mirror_num, &eb); - if (ret) - __readahead_hook(fs_info, re, NULL, ret); - else if (eb) - __readahead_hook(fs_info, re, eb, ret); - - if (eb) - free_extent_buffer(eb); - - atomic_dec(&dev->reada_in_flight); - reada_extent_put(fs_info, re); - - return 1; - -} - -static void reada_start_machine_worker(struct btrfs_work *work) -{ - struct reada_machine_work *rmw; - int old_ioprio; - - rmw = container_of(work, struct reada_machine_work, work); - - old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), - task_nice_ioprio(current)); - set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(rmw->fs_info); - set_task_ioprio(current, old_ioprio); - - atomic_dec(&rmw->fs_info->reada_works_cnt); - - kfree(rmw); -} - -/* Try to start up to 10k READA requests for a group of devices */ -static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) -{ - u64 enqueued; - u64 total = 0; - struct btrfs_device *device; - - do { - enqueued = 0; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (atomic_read(&device->reada_in_flight) < - MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(device); - } - total += enqueued; - } while (enqueued && total < 10000); - - return total; -} - -static void __reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; - int i; - u64 enqueued = 0; - - mutex_lock(&fs_devices->device_list_mutex); - - enqueued += reada_start_for_fsdevs(fs_devices); - list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) - enqueued += reada_start_for_fsdevs(seed_devs); - - mutex_unlock(&fs_devices->device_list_mutex); - if (enqueued == 0) - return; - - /* - * If everything is already in the cache, this is effectively single - * threaded. To a) not hold the caller for too long and b) to utilize - * more cores, we broke the loop above after 10000 iterations and now - * enqueue to workers to finish it. This will distribute the load to - * the cores. 
- */ - for (i = 0; i < 2; ++i) { - reada_start_machine(fs_info); - if (atomic_read(&fs_info->reada_works_cnt) > - BTRFS_MAX_MIRRORS * 2) - break; - } -} - -static void reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct reada_machine_work *rmw; - - rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); - if (!rmw) { - /* FIXME we cannot handle this properly right now */ - BUG(); - } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); - rmw->fs_info = fs_info; - - btrfs_queue_work(fs_info->readahead_workers, &rmw->work); - atomic_inc(&fs_info->reada_works_cnt); -} - -#ifdef DEBUG -static void dump_devs(struct btrfs_fs_info *fs_info, int all) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - unsigned long index; - int ret; - int i; - int j; - int cnt; - - spin_lock(&fs_info->reada_lock); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid, - atomic_read(&device->reada_in_flight)); - index = 0; - while (1) { - struct reada_zone *zone; - ret = radix_tree_gang_lookup(&device->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - pr_debug(" zone %llu-%llu elems %llu locked %d devs", - zone->start, zone->end, zone->elems, - zone->locked); - for (j = 0; j < zone->ndevs; ++j) { - pr_cont(" %lld", - zone->devs[j]->devid); - } - if (device->reada_curr_zone == zone) - pr_cont(" curr off %llu", - device->reada_next - zone->start); - pr_cont("\n"); - index = (zone->end >> fs_info->sectorsize_bits) + 1; - } - cnt = 0; - index = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&device->reada_extents, - (void **)&re, index, 1); - if (ret == 0) - break; - pr_debug(" re: logical %llu size %u empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - if (++cnt > 15) - break; - } - } - - index = 0; - cnt = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, - index, 1); - if (ret == 0) - break; - if (!re->scheduled) { - index = (re->logical >> fs_info->sectorsize_bits) + 1; - continue; - } - pr_debug("re: logical %llu size %u list empty %d scheduled %d", - re->logical, fs_info->nodesize, - list_empty(&re->extctl), re->scheduled); - for (i = 0; i < re->nzones; ++i) { - pr_cont(" zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - pr_cont(" %lld", - re->zones[i]->devs[j]->devid); - } - } - pr_cont("\n"); - index = (re->logical >> fs_info->sectorsize_bits) + 1; - } - spin_unlock(&fs_info->reada_lock); -} -#endif - -/* - * interface - */ -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *key_start, struct btrfs_key *key_end) -{ - struct reada_control *rc; - u64 start; - u64 generation; - int ret; - int level; - struct extent_buffer *node; - static struct btrfs_key max_key = { - .objectid = (u64)-1, - .type = (u8)-1, - .offset = (u64)-1 - }; - - rc = kzalloc(sizeof(*rc), GFP_KERNEL); - if (!rc) - return ERR_PTR(-ENOMEM); - - rc->fs_info = root->fs_info; - rc->key_start = *key_start; - rc->key_end = *key_end; - 
atomic_set(&rc->elems, 0); - init_waitqueue_head(&rc->wait); - kref_init(&rc->refcnt); - kref_get(&rc->refcnt); /* one ref for having elements */ - - node = btrfs_root_node(root); - start = node->start; - generation = btrfs_header_generation(node); - level = btrfs_header_level(node); - free_extent_buffer(node); - - ret = reada_add_block(rc, start, &max_key, root->root_key.objectid, - generation, level); - if (ret) { - kfree(rc); - return ERR_PTR(ret); - } - - reada_start_machine(root->fs_info); - - return rc; -} - -#ifdef DEBUG -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->fs_info; - - while (atomic_read(&rc->elems)) { - if (!atomic_read(&fs_info->reada_works_cnt)) - reada_start_machine(fs_info); - wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, - 5 * HZ); - dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); - } - - dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#else -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->fs_info; - - while (atomic_read(&rc->elems)) { - if (!atomic_read(&fs_info->reada_works_cnt)) - reada_start_machine(fs_info); - wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, - (HZ + 9) / 10); - } - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#endif - -void btrfs_reada_detach(void *handle) -{ - struct reada_control *rc = handle; - - kref_put(&rc->refcnt, reada_control_release); -} - -/* - * Before removing a device (device replace or device remove ioctls), call this - * function to wait for all existing readahead requests on the device and to - * make sure no one queues more readahead requests for the device. - * - * Must be called while holding neither the device list mutex nor the device - * replace semaphore, otherwise it will deadlock. - */ -void btrfs_reada_remove_dev(struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = dev->fs_info; - - /* Serialize with readahead extent creation at reada_find_extent(). */ - spin_lock(&fs_info->reada_lock); - set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); - spin_unlock(&fs_info->reada_lock); - - /* - * There might be readahead requests added to the radix trees which - * were not yet added to the readahead work queue. We need to start - * them and wait for their completion, otherwise we can end up with - * use-after-free problems when dropping the last reference on the - * readahead extents and their zones, as they need to access the - * device structure. - */ - reada_start_machine(fs_info); - btrfs_flush_workqueue(fs_info->readahead_workers); -} - -/* - * If, when removing a device (device replace or device remove ioctls), an error - * happens after calling btrfs_reada_remove_dev(), call this to undo what that - * function did. This is safe to call even if btrfs_reada_remove_dev() was not - * called before. 
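Taken together, the last two helpers bracket device removal. A hypothetical caller, with do_remove() standing in for the actual replace/remove steps:

	btrfs_reada_remove_dev(device);	/* stop new readahead, drain queued work */

	ret = do_remove(device);
	if (ret)
		/* safe even on paths where remove_dev was never reached */
		btrfs_reada_undo_remove_dev(device);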
- */ -void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) -{ - spin_lock(&dev->fs_info->reada_lock); - clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); - spin_unlock(&dev->fs_info->reada_lock); -} diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d2062d5f71dd..a248f46cfe72 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct extent_buffer *leaf = path->nodes[0]; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; int type, ret; @@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.root; + ref_root = generic_ref->tree_ref.owning_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.ref_root; + ref_root = generic_ref->data_ref.owning_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } @@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, /* Walk down all roots and build the ref tree, meant to be called at mount */ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *eb; int tree_block_level = 0; @@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - eb = btrfs_read_lock_root_node(fs_info->extent_root); + extent_root = btrfs_extent_root(fs_info, 0); + eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * would have had to add a ref key item which may appear on a * different leaf from the original extent item. 
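The ref-verify conversion shows the caller pattern this series introduces ahead of extent-tree-v2: rather than one root cached on fs_info, callers ask for the extent root covering the bytenr they are about to work on (0 when starting a walk from the beginning, as here). Schematically:

	/* old: single global extent tree */
	root = fs_info->extent_root;

	/*
	 * new: resolved per bytenr, so the extent tree can later be
	 * sharded by block group without touching callers again
	 */
	root = btrfs_extent_root(fs_info, bytenr);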
*/ - ret = walk_down_tree(fs_info->extent_root, path, level, + ret = walk_down_tree(extent_root, path, level, &bytenr, &num_bytes, &tree_block_level); if (ret) break; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9b0814318e72..a3930da4eb3f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, file_offset, block_size); btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { @@ -439,7 +439,7 @@ process_slot: break; } next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); + size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { - int ret; + int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 914d403b4415..f5465197996d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -25,6 +25,8 @@ #include "backref.h" #include "misc.h" #include "subpage.h" +#include "zoned.h" +#include "inode-item.h" /* * Relocation overview @@ -1145,9 +1147,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1156,9 +1158,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1367,8 +1369,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, blocksize, path->nodes[level]->start); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1376,8 +1378,8 @@ again: } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, + true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1386,8 +1388,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1396,8 
+1398,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1735,7 +1737,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, memset(&next_key, 0, sizeof(next_key)); while (1) { - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) goto out; @@ -1854,7 +1857,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; @@ -2322,8 +2325,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * If we get an enospc just kick back -EAGAIN so we know to drop the * transaction and try to refill when we can flush all the things. */ - ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_LIMIT); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; while (tmp <= rc->reserved_bytes) @@ -2473,9 +2476,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, upper->eb->start); - ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb)); + btrfs_header_owner(upper->eb), + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2691,8 +2694,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, key, path, 0, 1); btrfs_release_path(path); + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); if (ret > 0) ret = 0; } @@ -2852,31 +2859,6 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - /* - * On a zoned filesystem, we cannot preallocate the file region. - * Instead, we dirty and fiemap_write the region. 
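relocation.c also picks up the block-rsv API change: btrfs_block_rsv_add() and btrfs_block_rsv_refill() now take fs_info directly instead of an arbitrary root that was only dereferenced for it. The conversion is mechanical:

	/* before */
	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);

	/* after */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);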
- */ - if (btrfs_is_zoned(inode->root->fs_info)) { - struct btrfs_root *root = inode->root; - struct btrfs_trans_handle *trans; - - end = cluster->end - offset + 1; - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - i_size_write(&inode->vfs_inode, end); - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - return btrfs_end_transaction(trans); - } - btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2903,9 +2885,8 @@ static noinline_for_stack int prealloc_file_extent_cluster( return ret; } -static noinline_for_stack -int setup_extent_mapping(struct inode *inode, u64 start, u64 end, - u64 block_start) +static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, + u64 start, u64 end, u64 block_start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; @@ -3084,7 +3065,6 @@ release_page: static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; @@ -3105,7 +3085,7 @@ static int relocate_file_extent_cluster(struct inode *inode, file_ra_state_init(ra, inode->i_mapping); - ret = setup_extent_mapping(inode, cluster->start - offset, + ret = setup_relocation_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) goto out; @@ -3114,8 +3094,6 @@ static int relocate_file_extent_cluster(struct inode *inode, for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); - if (btrfs_is_zoned(fs_info) && !ret) - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3173,7 +3151,7 @@ static int add_tree_block(struct reloc_control *rc, u64 owner = 0; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { @@ -3574,7 +3552,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->fs_info->nodesize * RELOCATION_RESERVED_NODES; - ret = btrfs_block_rsv_refill(rc->extent_root, + ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, rc->block_rsv, rc->block_rsv->size, BTRFS_RESERVE_FLUSH_ALL); if (ret) @@ -3622,9 +3600,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { rc->reserved_bytes = 0; - ret = btrfs_block_rsv_refill(rc->extent_root, - rc->block_rsv, rc->block_rsv->size, - BTRFS_RESERVE_FLUSH_ALL); + ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, + rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { err = ret; break; @@ -3770,12 +3748,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; - if (btrfs_is_zoned(trans->fs_info)) - flags &= ~BTRFS_INODE_PREALLOC; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3790,7 +3764,8 @@ static int 
__insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -3885,25 +3860,14 @@ out: * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started - * -EAGAIN can not start because there are ongoing send operations */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { - spin_lock(&fs_info->send_reloc_lock); - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run relocation while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - spin_unlock(&fs_info->send_reloc_lock); - return -EAGAIN; - } if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ - spin_unlock(&fs_info->send_reloc_lock); btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } - spin_unlock(&fs_info->send_reloc_lock); if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); @@ -3925,9 +3889,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); - spin_lock(&fs_info->send_reloc_lock); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); - spin_unlock(&fs_info->send_reloc_lock); atomic_set(&fs_info->reloc_cancel_req, 0); } @@ -3990,7 +3952,7 @@ static const char *stage_to_string(int stage) int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { struct btrfs_block_group *bg; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; struct btrfs_path *path; @@ -4063,6 +4025,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->start, rc->block_group->length); + ret = btrfs_zone_finish(rc->block_group); + WARN_ON(ret && ret != -EAGAIN); + while (1) { int finishes_stage; @@ -4238,7 +4203,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out_end; } - rc->extent_root = fs_info->extent_root; + rc->extent_root = btrfs_extent_root(fs_info, 0); set_reloc_control(rc); @@ -4329,6 +4294,7 @@ out: int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; int ret; @@ -4340,7 +4306,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + inode->index_cnt; - ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, + csum_root = btrfs_csum_root(fs_info, disk_bytenr); + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) goto out; @@ -4386,8 +4353,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, if (!rc) return 0; - BUG_ON(rc->stage == UPDATE_DATA_PTRS && - root->root_key.objectid == 
BTRFS_DATA_RELOC_TREE_OBJECTID); + BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 702dc5441f03..3d68d2dcd83e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, u32 len; int need_reset = 0; - len = btrfs_item_size_nr(eb, slot); + len = btrfs_item_size(eb, slot); read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), min_t(u32, len, sizeof(*item))); if (len < sizeof(*item)) @@ -39,10 +39,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, need_reset = 1; } if (need_reset) { - memset(&item->generation_v2, 0, - sizeof(*item) - offsetof(struct btrfs_root_item, - generation_v2)); - + /* Clear all members from generation_v2 onwards. */ + memset_startat(item, 0, generation_v2); generate_random_guid(item->uuid); } } @@ -148,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); - old_len = btrfs_item_size_nr(l, slot); + old_len = btrfs_item_size(l, slot); /* * If this is the first time we update the root item which originated @@ -336,7 +334,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret < 0); + if (ret < 0) + goto out; if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -503,7 +502,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 088641ba7a8e..2e9a322773f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -39,25 +39,24 @@ struct scrub_block; struct scrub_ctx; /* - * the following three values only influence the performance. + * The following two values only influence the performance. + * * The last one configures the number of parallel and outstanding I/O - * operations. The first two values configure an upper limit for the number + * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ -#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ +#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ /* - * the following value times PAGE_SIZE needs to be large enough to match the + * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. - * Values larger than BTRFS_STRIPE_LEN are not supported.
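For readers unfamiliar with memset_startat() used in the btrfs_read_root_item() hunk above: as far as I can tell it is equivalent to the open-coded memset it replaces, clearing everything from the named member to the end of the struct. A sketch of that equivalence, restating the removed code:

/* Rough equivalent of the memset_startat() call above (sketch only). */
static inline void clear_root_item_tail(struct btrfs_root_item *item)
{
	memset(&item->generation_v2, 0,
	       sizeof(*item) - offsetof(struct btrfs_root_item, generation_v2));
}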
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ +#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 map_length; }; @@ -73,8 +72,8 @@ struct scrub_page { u64 physical_for_dev_replace; atomic_t refs; u8 mirror_num; - int have_csum:1; - int io_error:1; + unsigned int have_csum:1; + unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; @@ -88,11 +87,7 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; -#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO - struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; -#else - struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; -#endif + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; int page_count; int next_free; struct btrfs_work work; @@ -163,7 +158,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_rd_bio; + int pages_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -174,7 +169,6 @@ struct scrub_ctx { struct scrub_bio *wr_curr_bio; struct mutex wr_lock; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ struct btrfs_device *wr_tgtdev; bool flush_all_writes; @@ -254,7 +248,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) { return spage->recover && - (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -578,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; + sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -616,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->wr_curr_bio = NULL; if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); - sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } @@ -758,7 +751,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -798,7 +791,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, { if (refcount_dec_and_test(&recover->refs)) { btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(recover->bbio); + btrfs_put_bioc(recover->bioc); kfree(recover); } } @@ -852,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; - if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) - return btrfs_repair_one_zone(fs_info, logical); + if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) + return 0; /* * We must use GFP_NOFS because the scrub task might be waiting for a @@ -1027,8 +1020,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; } else { struct scrub_recover *r = 
sblock_bad->pagev[0]->recover; - int max_allowed = r->bbio->num_stripes - - r->bbio->num_tgtdevs; + int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; @@ -1218,14 +1210,14 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) +static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) { - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) return 2; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) return 3; else - return (int)bbio->num_stripes; + return (int)bioc->num_stripes; } static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, @@ -1269,7 +1261,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, u64 flags = original_sblock->pagev[0]->flags; u64 have_csum = original_sblock->pagev[0]->have_csum; struct scrub_recover *recover; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; @@ -1288,7 +1280,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; - bbio = NULL; + bioc = NULL; /* * With a length of sectorsize, each returned stripe represents @@ -1296,27 +1288,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio); - if (ret || !bbio || mapped_length < sublen) { - btrfs_put_bbio(bbio); + logical, &mapped_length, &bioc); + if (ret || !bioc || mapped_length < sublen) { + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -ENOMEM; } refcount_set(&recover->refs, 1); - recover->bbio = bbio; + recover->bioc = bioc; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); - nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { @@ -1348,17 +1340,17 @@ leave_nomem: sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, - bbio->map_type, - bbio->raid_map, + bioc->map_type, + bioc->raid_map, mapped_length, - bbio->num_stripes - - bbio->num_tgtdevs, + bioc->num_stripes - + bioc->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); - spage->physical = bbio->stripes[stripe_index].physical + + spage->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bbio->stripes[stripe_index].dev; + spage->dev = bioc->stripes[stripe_index].dev; BUG_ON(page_index >= original_sblock->page_count); spage->physical_for_dev_replace = @@ -1401,7 +1393,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio, + ret = raid56_parity_recover(bio, spage->recover->bioc, spage->recover->map_length, mirror_num, 0); if (ret) @@ -1423,7 +1415,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto 
out; - bio = btrfs_io_bio_alloc(BIO_MAX_VECS); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { @@ -1480,7 +1472,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!spage->page); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); bio_add_page(bio, spage->page, fs_info->sectorsize, 0); @@ -1562,7 +1554,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage_bad->dev->bdev); bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; @@ -1676,7 +1668,7 @@ again: sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -1709,7 +1701,7 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1756,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2102,7 +2094,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); + bio = btrfs_bio_alloc(sctx->pages_per_bio); sbio->bio = bio; } @@ -2136,7 +2128,7 @@ again: scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_rd_bio) + if (sbio->page_count == sctx->pages_per_bio) scrub_submit(sctx); return 0; @@ -2203,7 +2195,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2211,27 +2203,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; if (WARN_ON(!sctx->is_dev_replace || - !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or * there's a bug in scrub_remap_extent()/btrfs_map_block(). 
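The btrfs_io_bio_alloc() to btrfs_bio_alloc() conversions above all follow the same build-and-submit shape. Here is a condensed sketch of the single-sector recheck read, assuming, as the surrounding code suggests, that btrfs_bio_alloc() does not fail for these small vector counts; the helper name and the use of the generic submit_bio_wait() are mine, not the patch's:

/* Sketch of the recheck-read pattern used above; not part of the patch. */
static int scrub_read_one_sector(struct scrub_page *spage, u32 sectorsize)
{
	struct bio *bio = btrfs_bio_alloc(1);	/* one bio_vec is enough */
	int ret;

	bio_set_dev(bio, spage->dev->bdev);
	bio->bi_iter.bi_sector = spage->physical >> 9;	/* 512-byte units */
	bio->bi_opf = REQ_OP_READ;
	bio_add_page(bio, spage->page, sectorsize, 0);

	ret = submit_bio_wait(bio);	/* synchronous, like the recheck path */
	bio_put(bio);
	return ret;
}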
*/ - goto bbio_out; + goto bioc_out; } - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length); + rbio = raid56_alloc_missing_rbio(bio, bioc, length); if (!rbio) goto rbio_out; @@ -2249,9 +2241,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); @@ -2298,7 +2290,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); scrub_page_get(spage); sblock->pagev[index] = spage; spage->sblock = sblock; @@ -2370,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); + ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2632,7 +2624,7 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); /* For scrub block */ scrub_page_get(spage); sblock->pagev[index] = spage; @@ -2826,7 +2818,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 length; int ret; @@ -2838,17 +2830,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, - length, sparity->scrub_dev, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2860,9 +2852,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -2893,15 +2885,15 @@ static void scrub_parity_put(struct scrub_parity *sparity) static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, - struct btrfs_path *path, u64 logic_start, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); + struct btrfs_root *csum_root; struct btrfs_extent_item *extent; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; + struct btrfs_path *path; u64 flags; int ret; int 
slot; @@ -2920,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + path->search_commit_root = 1; + path->skip_locking = 1; + ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); @@ -2929,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); + btrfs_free_path(path); return -ENOMEM; } @@ -3044,23 +3047,24 @@ again: extent_len); mapped_length = extent_len; - bbio = NULL; + bioc = NULL; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bbio, + extent_logical, &mapped_length, &bioc, 0); if (!ret) { - if (!bbio || mapped_length < extent_len) + if (!bioc || mapped_length < extent_len) ret = -EIO; } if (ret) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } - extent_physical = bbio->stripes[0].physical; - extent_mirror_num = bbio->mirror_num; - extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); + csum_root = btrfs_csum_root(fs_info, extent_logical); ret = btrfs_lookup_csums_range(csum_root, extent_logical, extent_logical + extent_len - 1, @@ -3117,7 +3121,7 @@ out: scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); - btrfs_release_path(path); + btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -3162,17 +3166,18 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length, - struct btrfs_block_group *cache) + int stripe_index, u64 dev_extent_len) { - struct btrfs_path *path, *ppath; + struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_root *root; + struct btrfs_root *csum_root; struct btrfs_extent_item *extent; struct blk_plug plug; + const u64 chunk_logical = bg->start; u64 flags; int ret; int slot; @@ -3184,10 +3189,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical_end; u64 generation; int mirror_num; - struct reada_control *reada1; - struct reada_control *reada2; struct btrfs_key key; - struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; u64 extent_logical; @@ -3203,25 +3205,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[num].physical; + physical = map->stripes[stripe_index].physical; offset = 0; - nstripes = div64_u64(length, map->stripe_len); + nstripes = div64_u64(dev_extent_len, map->stripe_len); mirror_num = 1; increment = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * num; + offset = map->stripe_len * stripe_index; increment = map->stripe_len * map->num_stripes; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (num / map->sub_stripes); + 
offset = map->stripe_len * (stripe_index / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes + 1; + mirror_num = stripe_index % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = num % map->num_stripes + 1; + mirror_num = stripe_index % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = num % map->num_stripes + 1; + mirror_num = stripe_index % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, num, map, &offset, NULL); + get_raid56_logic_offset(physical, stripe_index, map, &offset, + NULL); increment = map->stripe_len * nr_data_stripes(map); } @@ -3229,12 +3232,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - ppath = btrfs_alloc_path(); - if (!ppath) { - btrfs_free_path(path); - return -ENOMEM; - } - /* * work on commit root. The related disk blocks are static as * long as COW is applied. This means, it is safe to rewrite @@ -3242,20 +3239,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ path->search_commit_root = 1; path->skip_locking = 1; + path->reada = READA_FORWARD; - ppath->search_commit_root = 1; - ppath->skip_locking = 1; - /* - * trigger the readahead for extent tree csum tree and wait for - * completion. During readahead, the scrub is officially paused - * to not hold off transaction commits - */ - logical = base + offset; + logical = chunk_logical + offset; physical_end = physical + nstripes * map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, num, + get_raid56_logic_offset(physical_end, stripe_index, map, &logic_end, NULL); - logic_end += base; + logic_end += chunk_logical; } else { logic_end = logical + increment * nstripes; } @@ -3263,32 +3254,8 @@ atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - /* FIXME it might be better to start readahead at commit root */ - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - key_end.objectid = logic_end; - key_end.type = BTRFS_METADATA_ITEM_KEY; - key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key, &key_end); - - if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); - } else { - reada2 = NULL; - } - - if (!IS_ERR(reada1)) - btrfs_reada_wait(reada1); - if (!IS_ERR_OR_NULL(reada2)) - btrfs_reada_wait(reada2); - + root = btrfs_extent_root(fs_info, logical); + csum_root = btrfs_csum_root(fs_info, logical); /* * collect all data csums for the stripe to avoid seeking during @@ -3334,16 +3301,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, map, - &logical, + ret = get_raid56_logic_offset(physical, stripe_index, + map, &logical, &stripe_logical); - logical += base; + logical += chunk_logical; if (ret) { /* it is parity stripe */ - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, +
stripe_logical, stripe_end); if (ret) goto out; @@ -3420,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Continuing would prevent reusing its device extents * for new block groups for a long time. */ - spin_lock(&cache->lock); - if (cache->removed) { - spin_unlock(&cache->lock); + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); ret = 0; goto out; } - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); @@ -3505,16 +3472,16 @@ again: loop: physical += map->stripe_len; ret = get_raid56_logic_offset(physical, - num, map, &logical, - &stripe_logical); - logical += base; + stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; if (ret && physical < physical_end) { - stripe_logical += base; + stripe_logical += chunk_logical; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, - map, scrub_dev, ppath, + map, scrub_dev, stripe_logical, stripe_end); if (ret) @@ -3544,8 +3511,8 @@ skip: physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[num].physical + - length; + sctx->stat.last_physical = map->stripes[stripe_index].physical + + dev_extent_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3561,14 +3528,14 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); - btrfs_free_path(ppath); if (sctx->is_dev_replace && ret >= 0) { int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, base + offset, - map->stripes[num].physical, - physical_end); + ret2 = sync_write_pointer_for_zoned(sctx, + chunk_logical + offset, + map->stripes[stripe_index].physical, + physical_end); if (ret2) ret = ret2; } @@ -3577,10 +3544,10 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, + struct btrfs_block_group *bg, struct btrfs_device *scrub_dev, - u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group *cache) + u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; @@ -3590,7 +3557,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, int ret = 0; read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, chunk_offset, 1); + em = lookup_extent_mapping(map_tree, bg->start, bg->length); read_unlock(&map_tree->lock); if (!em) { @@ -3598,26 +3565,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * Might have been an unused block group deleted by the cleaner * kthread or relocation. 
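The cache to bg renames in these scrub hunks keep repeating one small pattern: take the block group's spinlock, test ->removed, and bail out quietly when the group is gone. A hypothetical helper capturing it (the patch itself keeps the check open-coded at each site):

/* Hypothetical helper; the patch open-codes this check at each call site. */
static bool scrub_bg_is_removed(struct btrfs_block_group *bg)
{
	bool removed;

	spin_lock(&bg->lock);
	removed = bg->removed;
	spin_unlock(&bg->lock);

	return removed;
}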
*/ - spin_lock(&cache->lock); - if (!cache->removed) + spin_lock(&bg->lock); + if (!bg->removed) ret = -EINVAL; - spin_unlock(&cache->lock); + spin_unlock(&bg->lock); return ret; } - - map = em->map_lookup; - if (em->start != chunk_offset) + if (em->start != bg->start) goto out; - - if (em->len < length) + if (em->len < dev_extent_len) goto out; + map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, cache); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i, + dev_extent_len); if (ret) goto out; } @@ -3655,7 +3620,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; - u64 length; u64 chunk_offset; int ret = 0; int ro_set; @@ -3679,6 +3643,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, key.type = BTRFS_DEV_EXTENT_KEY; while (1) { + u64 dev_extent_len; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) break; @@ -3715,9 +3681,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - length = btrfs_dev_extent_length(l, dev_extent); + dev_extent_len = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) + if (found_key.offset + dev_extent_len <= start) goto skip; chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); @@ -3851,13 +3817,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); - dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_right = found_key.offset + dev_extent_len; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache); + ASSERT(cache->start == chunk_offset); + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); /* * flush, submit all pending read and write bios, afterwards @@ -3938,7 +3905,7 @@ skip_unfreeze: break; } skip: - key.offset = found_key.offset + length; + key.offset = found_key.offset + dev_extent_len; btrfs_release_path(path); } @@ -3956,7 +3923,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_fs_info *fs_info = sctx->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; /* Seed devices of a new filesystem have their own generation.
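On the BTRFS_FS_ERROR() conversion in scrub_supers() just above: judging from the test_bit() call it replaces, the macro is presumably a thin wrapper around the same fs_state bit check. A sketch of the assumed definition:

/* Assumed shape of BTRFS_FS_ERROR(); inferred from the call it replaces. */
#define BTRFS_FS_ERROR(fs_info) \
	(unlikely(test_bit(BTRFS_FS_STATE_ERROR, &(fs_info)->fs_state)))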
*/ @@ -4068,6 +4035,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; @@ -4115,7 +4083,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4288,11 +4256,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev) int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_device *dev; struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (dev) sctx = dev->scrub_ctx; if (sctx) @@ -4309,20 +4278,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, int *extent_mirror_num) { u64 mapped_length; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; mapped_length = extent_len; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, - &mapped_length, &bbio, 0); - if (ret || !bbio || mapped_length < extent_len || - !bbio->stripes[0].dev->bdev) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc, 0); + if (ret || !bioc || mapped_length < extent_len || + !bioc->stripes[0].dev->bdev) { + btrfs_put_bioc(bioc); return; } - *extent_physical = bbio->stripes[0].physical; - *extent_mirror_num = bbio->mirror_num; - *extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + *extent_physical = bioc->stripes[0].physical; + *extent_mirror_num = bioc->mirror_num; + *extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 72f9b865e847..d8ccb62aa7d2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "compression.h" #include "xattr.h" +#include "print-tree.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -84,6 +85,8 @@ struct send_ctx { u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + /* Protocol version compatibility requested */ + u32 proto; struct btrfs_root *send_root; struct btrfs_root *parent_root; @@ -96,6 +99,15 @@ struct send_ctx { struct btrfs_key *cmp_key; /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + + /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. 
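The last_reloc_trans field added to send_ctx above is compared against fs_info->last_reloc_trans at several points later in this patch, always under the commit root semaphore. A minimal sketch of that staleness test as a helper; the helper name is mine, the patch open-codes it:

/* Hypothetical helper for the staleness check described above. */
static bool send_saw_relocation(const struct send_ctx *sctx,
				struct btrfs_fs_info *fs_info)
{
	bool stale;

	down_read(&fs_info->commit_root_sem);
	stale = fs_info->last_reloc_trans > sctx->last_reloc_trans;
	up_read(&fs_info->commit_root_sem);

	return stale;
}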
*/ @@ -312,6 +324,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, sctx->parent_root->root_key.objectid : 0)); } +__maybe_unused +static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) +{ + switch (sctx->proto) { + case 1: return cmd < __BTRFS_SEND_C_MAX_V1; + case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + default: return false; + } +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -886,7 +908,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; @@ -918,12 +939,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(slot); - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*iref); } else { ptr = btrfs_item_ptr_offset(eb, slot); - total = btrfs_item_size_nr(eb, slot); + total = btrfs_item_size(eb, slot); elem_size = sizeof(*extref); } @@ -992,7 +1012,7 @@ out: typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx); + void *ctx); /* * Helper function to iterate the entries in ONE btrfs_dir_item. @@ -1006,7 +1026,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, { int ret = 0; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; @@ -1018,7 +1037,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, u32 total; int slot; int num; - u8 type; /* * Start with a small buffer (1 page). If later we end up needing more @@ -1035,20 +1053,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; - total = btrfs_item_size(eb, item); + total = btrfs_item_size(eb, slot); num = 0; while (cur < total) { name_len = btrfs_dir_name_len(eb, di); data_len = btrfs_dir_data_len(eb, di); - type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (type == BTRFS_FT_XATTR) { + if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; @@ -1098,7 +1114,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, cur += len; ret = iterate(num, &di_key, buf, name_len, buf + name_len, - data_len, type, ctx); + data_len, ctx); if (ret < 0) goto out; if (ret) { @@ -1415,6 +1431,26 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. 
Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + ret = -ENOENT; + goto out; + } + up_read(&fs_info->commit_root_sem); + if (!backref_ctx.found_itself) { /* found a bug in backref code? */ ret = -EIO; @@ -1680,8 +1716,7 @@ out: */ static int lookup_dir_item_inode(struct btrfs_root *root, u64 dir, const char *name, int name_len, - u64 *found_inode, - u8 *found_type) + u64 *found_inode) { int ret = 0; struct btrfs_dir_item *di; @@ -1704,7 +1739,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } *found_inode = key.objectid; - *found_type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); @@ -1827,7 +1861,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; - u8 other_type = 0; if (!sctx->parent_root) goto out; @@ -1855,7 +1888,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, } ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, - &other_inode, &other_type); + &other_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1900,7 +1933,6 @@ static int did_overwrite_ref(struct send_ctx *sctx, int ret = 0; u64 gen; u64 ow_inode; - u8 other_type; if (!sctx->parent_root) goto out; @@ -1924,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, - &ow_inode, &other_type); + &ow_inode); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -2720,19 +2752,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) if (S_ISDIR(sctx->cur_inode_mode)) { ret = did_create_dir(sctx, sctx->cur_ino); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + else if (ret > 0) + return 0; } - ret = send_create_inode(sctx, sctx->cur_ino); - if (ret < 0) - goto out; - -out: - return ret; + return send_create_inode(sctx, sctx->cur_ino); } struct recorded_ref { @@ -3617,7 +3642,7 @@ static int is_ancestor(struct btrfs_root *root, key.type != BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); while (cur_offset < item_size) { u64 parent; u64 parent_gen; @@ -4646,9 +4671,8 @@ out: } static int __process_new_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *name, int name_len, const char *data, + int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4692,8 +4716,7 @@ out: static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) + const char *data, int data_len, void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4738,10 +4761,8 @@ struct find_xattr_ctx { int found_data_len; }; -static int __find_xattr(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *vctx) +static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, + int name_len, const char *data, int data_len, void *vctx) { struct find_xattr_ctx *ctx = vctx; @@ -4791,7 +4812,7 @@ static int find_xattr(struct btrfs_root *root, static int __process_changed_new_xattr(int num, struct btrfs_key 
*di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4803,12 +4824,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); } else if (ret >= 0) { if (data_len != found_data_len || memcmp(data, found_data, data_len)) { ret = __process_new_xattr(num, di_key, name, name_len, - data, data_len, type, ctx); + data, data_len, ctx); } else { ret = 0; } @@ -4821,7 +4842,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, - u8 type, void *ctx) + void *ctx) { int ret; struct send_ctx *sctx = ctx; @@ -4830,7 +4851,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, - data_len, type, ctx); + data_len, ctx); else if (ret >= 0) ret = 0; @@ -6561,7 +6582,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *)(ptr + @@ -6592,6 +6613,50 @@ static int changed_cb(struct btrfs_path *left_path, { int ret = 0; + /* + * We can not hold the commit root semaphore here. This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. 
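The assertions just above and below check the same property: any leaf handed to changed_cb() must be a private clone, which btrfs_clone_extent_buffer() marks, as I understand it, with EXTENT_BUFFER_UNMAPPED. A compact restatement as a hypothetical helper; the patch writes both ASSERTs inline:

/* Hypothetical helper equivalent to the two inline ASSERTs here. */
static void assert_path_leaf_is_clone(const struct btrfs_path *path)
{
	if (path && path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&path->nodes[0]->bflags));
}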
+ */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { @@ -6638,14 +6703,46 @@ out: return ret; } +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. + */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ASSERT(ret <= 0); + if (ret > 0) { + btrfs_print_tree(path->nodes[path->lowest_level], false); + btrfs_err(root->fs_info, +"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", + key->objectid, key->type, key->offset, + (root == sctx->parent_root ? "parent" : "send"), + root->root_key.objectid, path->lowest_level, + path->slots[path->lowest_level]); + return -EUCLEAN; + } + + return ret; +} + static int full_send_tree(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; + struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_path *path; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -6656,6 +6753,10 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; + down_read(&fs_info->commit_root_sem); + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -6663,15 +6764,35 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + /* + * A transaction used for relocating a block group was + * committed or is about to finish its commit. Release + * our path (leaf) and restart the search, so that we + * avoid operating on any file extent items that are + * stale, with a disk_bytenr that reflects a pre + * relocation value. This way we avoid as much as + * possible to fallback to regular writes when checking + * if we can clone file ranges. 
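To make the control flow of full_send_tree() easier to follow, here is the restart step it performs right after this comment, whenever a relocation transaction committed in the meantime, condensed into a sketch (the patch inlines these two calls):

/* Condensed sketch of the restart step in full_send_tree(). */
static int full_send_restart(struct send_ctx *sctx, struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_key *key)
{
	/* Drop the possibly stale leaf and re-find the current key. */
	btrfs_release_path(path);
	return search_key_again(sctx, root, path, key);
}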
+ */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; @@ -6689,6 +6810,20 @@ out: return ret; } +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; @@ -6698,6 +6833,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_max; u64 reada_done = 0; + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) @@ -6721,6 +6858,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + return 0; } @@ -6734,8 +6875,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, path->slots[*level]++; while (path->slots[*level] >= nritems) { - if (*level == root_level) + if (*level == root_level) { + path->slots[*level] = nritems - 1; return -1; + } /* move upnext */ path->slots[*level] = 0; @@ -6767,14 +6910,20 @@ static int tree_advance(struct btrfs_path *path, } else { ret = tree_move_down(path, level, reada_min_gen); } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } + + /* + * Even if we have reached the end of a tree (ret is -1), update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + return ret; } @@ -6786,8 +6935,8 @@ static int tree_compare_item(struct btrfs_path *left_path, int len1, len2; unsigned long off1, off2; - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); if (len1 != len2) return 1; @@ -6804,6 +6953,97 @@ } /* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exceptions are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation.
However they + * can have file extent items with a pre relocation disk_bytenr value, so we + * restart the search from the current commit roots and clone the new leaves so + * that we get the post relocation disk_bytenr values. Not doing so could + * make us clone the wrong data in case there are new extents using the old + * disk_bytenr that happen to be shared. + */ +static int restart_after_relocation(struct btrfs_path *left_path, + struct btrfs_path *right_path, + const struct btrfs_key *left_key, + const struct btrfs_key *right_key, + int left_level, + int right_level, + const struct send_ctx *sctx) +{ + int root_level; + int ret; + + lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); + + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + /* + * Since keys can not be added or removed to/from our roots because they + * are readonly and we do not allow deduplication to run in parallel + * (which can add, remove or change keys), the layout of the trees should + * not change. + */ + left_path->lowest_level = left_level; + ret = search_key_again(sctx, sctx->send_root, left_path, left_key); + if (ret < 0) + return ret; + + right_path->lowest_level = right_level; + ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); + if (ret < 0) + return ret; + + /* + * If the lowest level nodes are leaves, clone them so that they can be + * safely used by changed_cb() while not under the protection of the + * commit root semaphore, even if relocation and reallocation happen in + * parallel. + */ + if (left_level == 0) { + ret = replace_node_with_clone(left_path, 0); + if (ret < 0) + return ret; + } + + if (right_level == 0) { + ret = replace_node_with_clone(right_path, 0); + if (ret < 0) + return ret; + } + + /* + * Now clone the root nodes (unless they happen to be the leaves we have + * already cloned). This is to protect against concurrent snapshotting of + * the send and parent roots (see the comment at btrfs_compare_trees()). + */ + root_level = btrfs_header_level(sctx->send_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(left_path, root_level); + if (ret < 0) + return ret; + } + + root_level = btrfs_header_level(sctx->parent_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(right_path, root_level); + if (ret < 0) + return ret; + } + + return 0; +} + +/* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. * If shared tree blocks are encountered, whole subtrees are skipped, making @@ -6831,10 +7071,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, int right_root_level; int left_level; int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; @@ -6902,12 +7142,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free.
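The clone-under-semaphore step described above is performed once per root just below; here is a sketch of it factored into a helper (hypothetical, the patch keeps it inline for both the left and right paths):

/* Hypothetical helper for the clone-under-semaphore step described above. */
static int path_clone_commit_root(struct btrfs_path *path,
				  struct btrfs_root *root)
{
	int level = btrfs_header_level(root->commit_root);

	/* Caller must hold fs_info->commit_root_sem for reading. */
	path->nodes[level] = btrfs_clone_extent_buffer(root->commit_root);
	if (!path->nodes[level])
		return -ENOMEM;

	return level;
}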
+ */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); @@ -6915,9 +7161,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" @@ -6927,7 +7172,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -6942,11 +7186,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; + sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { - cond_resched(); + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, @@ -6955,7 +7214,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -6966,54 +7225,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; - goto out; + goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); - if (ret < 0) - goto out; advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; @@ -7027,11 +7287,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, 
right_path, &left_key, result, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; advance_right = ADVANCE; } + + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { @@ -7071,6 +7333,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, } } +out_unlock: + up_read(&fs_info->commit_root_sem); out: btrfs_free_path(left_path); btrfs_free_path(right_path); @@ -7276,6 +7540,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->flags = arg->flags; + if (arg->flags & BTRFS_SEND_FLAG_VERSION) { + if (arg->version > BTRFS_SEND_STREAM_VERSION) { + ret = -EPROTO; + goto out; + } + /* Zero means "use the highest version" */ + sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; + } else { + sctx->proto = 1; + } + sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { ret = -EBADF; @@ -7409,21 +7684,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - spin_lock(&fs_info->send_reloc_lock); - if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { - spin_unlock(&fs_info->send_reloc_lock); - btrfs_warn_rl(fs_info, - "cannot run send because a relocation operation is in progress"); - ret = -EAGAIN; - goto out; - } - fs_info->send_in_progress++; - spin_unlock(&fs_info->send_reloc_lock); - ret = send_subvol(sctx); - spin_lock(&fs_info->send_reloc_lock); - fs_info->send_in_progress--; - spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index de91488b7cd0..23bcefc84e49 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -48,6 +48,7 @@ struct btrfs_tlv_header { enum btrfs_send_cmd { BTRFS_SEND_C_UNSPEC, + /* Version 1 */ BTRFS_SEND_C_SUBVOL, BTRFS_SEND_C_SNAPSHOT, @@ -76,6 +77,12 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + __BTRFS_SEND_C_MAX_V1, + + /* Version 2 */ + __BTRFS_SEND_C_MAX_V2, + + /* End */ __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index aa5be0b24987..294242c194d8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, enum btrfs_flush_state state, bool for_preempt) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -844,6 +844,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + if (!ticket->steal) + return false; + if (global_rsv->space_info != space_info) return false; @@ -885,6 +888,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; + const bool aborted = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); @@ -898,16 +902,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) + if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 
btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); remove_ticket(space_info, ticket); - ticket->error = -ENOSPC; + if (aborted) + ticket->error = -EIO; + else + ticket->error = -ENOSPC; wake_up(&ticket->wait); /* @@ -916,7 +922,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, * here to see if we can make progress with the next ticket in * the list. */ - btrfs_try_granting_tickets(fs_info, space_info); + if (!aborted) + btrfs_try_granting_tickets(fs_info, space_info); } return (tickets_id != space_info->tickets_id); } @@ -1172,6 +1179,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } @@ -1202,9 +1213,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) } else { flush_state = 0; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; + } spin_unlock(&space_info->lock); } + return; + +aborted_fs: + maybe_fail_all_tickets(fs_info, space_info); + space_info->flush = 0; + spin_unlock(&space_info->lock); } void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) @@ -1240,18 +1262,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int states_nr) { u64 to_reclaim; - int flush_state; + int flush_state = 0; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); - if (!to_reclaim) { + /* + * This is the priority reclaim path, so to_reclaim could be >0 still + * because we may have only satisfied the priority tickets and still + * left non-priority tickets on the list. We would then have + * to_reclaim but ->bytes == 0. + */ + if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - flush_state = 0; - do { + while (flush_state < states_nr) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, to_reclaim, states[flush_state], false); flush_state++; @@ -1260,23 +1287,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); - } while (flush_state < states_nr); + } + + /* Attempt to steal from the global rsv if we can. */ + if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + } + + /* + * We must run try_granting_tickets here because we could be a large + * ticket in front of a smaller ticket that can now be satisfied with + * the available space. + */ + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + spin_lock(&space_info->lock); + + /* We could have been granted before we got here. 
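Both priority reclaim paths above follow the same lock/recheck protocol: ticket->bytes only drops to zero under space_info->lock, so rechecking it every time the lock is re-taken is what makes dropping the lock around flush_space() safe. Distilled into a sketch (not code from the patch):

	spin_lock(&space_info->lock);
	while (flush_state < states_nr) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, to_reclaim,
			    states[flush_state++], false);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {	/* granted while flushing */
			spin_unlock(&space_info->lock);
			return;
		}
	}
	/* Still unsatisfied: fail or steal while holding the lock. */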
*/ + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + while (!space_info->full) { + spin_unlock(&space_info->lock); flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); return; } - spin_unlock(&space_info->lock); } + + ticket->error = -ENOSPC; + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1358,25 +1411,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, break; } - spin_lock(&space_info->lock); ret = ticket->error; - if (ticket->bytes || ticket->error) { - /* - * We were a priority ticket, so we need to delete ourselves - * from the list. Because we could have other priority tickets - * behind us that require less space, run - * btrfs_try_granting_tickets() to see if their reservations can - * now be made. - */ - if (!list_empty(&ticket->list)) { - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); - } - - if (!ret) - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); /* * Check that we can't have an error set if the reservation succeeded, @@ -1418,6 +1453,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, space_info->clamp = min(space_info->clamp + 1, 8); } +static inline bool can_steal(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_EVICT); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1491,7 +1532,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); - ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1547,7 +1588,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, /** * Try to reserve metadata bytes from the block_rsv's space * - * @root: the root we're allocating for + * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1559,22 +1600,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. 
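With can_steal() above, an eviction-path reservation may now also fall back to the global block reserve. A hypothetical caller for illustration (the helper name is made up, only btrfs_reserve_metadata_bytes() and the flush enum come from the patch):

static int reserve_for_evict(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *rsv, u64 bytes)
{
	/* ticket.steal will be true: can_steal(BTRFS_RESERVE_FLUSH_EVICT) */
	return btrfs_reserve_metadata_bytes(fs_info, rsv, bytes,
					    BTRFS_RESERVE_FLUSH_EVICT);
}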
*/ -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index cb5056472e79..d841fed73492 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_root *root, +int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb10e56ee31e..29bd8c7a7706 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,11 +63,41 @@ * This means a slightly higher tree locking latency. */ +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) +{ + unsigned int cur = 0; + unsigned int nr_bits; + + ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); + + nr_bits = PAGE_SIZE / sectorsize; + subpage_info->bitmap_nr_bits = nr_bits; + + subpage_info->uptodate_offset = cur; + cur += nr_bits; + + subpage_info->error_offset = cur; + cur += nr_bits; + + subpage_info->dirty_offset = cur; + cur += nr_bits; + + subpage_info->writeback_offset = cur; + cur += nr_bits; + + subpage_info->ordered_offset = cur; + cur += nr_bits; + + subpage_info->checked_offset = cur; + cur += nr_bits; + + subpage_info->total_nr_bits = cur; +} + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { - struct btrfs_subpage *subpage = NULL; - int ret; + struct btrfs_subpage *subpage; /* * We have cases like a dummy extent buffer page, which is not mapped @@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, */ if (page->mapping) ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) return 0; - ret = btrfs_alloc_subpage(fs_info, &subpage, type); - if (ret < 0) - return ret; + subpage = btrfs_alloc_subpage(fs_info, type); + if (IS_ERR(subpage)) + return PTR_ERR(subpage); + attach_page_private(page, subpage); return 0; } @@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, btrfs_free_subpage(subpage); } -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type) +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type) { - if (fs_info->sectorsize == PAGE_SIZE) - return 0; + struct btrfs_subpage *ret; + unsigned int real_size; + + ASSERT(fs_info->sectorsize < PAGE_SIZE); + + real_size = struct_size(ret, bitmaps, + BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + ret = 
kzalloc(real_size, GFP_NOFS); + if (!ret) + return ERR_PTR(-ENOMEM); - *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); - if (!*ret) - return -ENOMEM; - spin_lock_init(&(*ret)->lock); + spin_lock_init(&ret->lock); if (type == BTRFS_SUBPAGE_METADATA) { - atomic_set(&(*ret)->eb_refs, 0); + atomic_set(&ret->eb_refs, 0); } else { - atomic_set(&(*ret)->readers, 0); - atomic_set(&(*ret)->writers, 0); + atomic_set(&ret->readers, 0); + atomic_set(&ret->writers, 0); } - return 0; + return ret; } void btrfs_free_subpage(struct btrfs_subpage *subpage) @@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) u32 orig_len = *len; *start = max_t(u64, page_offset(page), orig_start); + /* + * For certain call sites like btrfs_drop_pages(), we may have pages + * beyond the target range. In that case, just set @len to 0, subpage + * helpers can handle @len == 0 without any problem. + */ + if (page_offset(page) >= orig_start + orig_len) + *len = 0; + else + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; - *len = min_t(u64, page_offset(page) + PAGE_SIZE, - orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, @@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, page, start, len); + /* + * We have call sites passing @locked_page into + * extent_clear_unlock_delalloc() for compression path. + * + * This @locked_page is locked by plain lock_page(), thus its + * subpage::writers is 0. Handle them in a special way. + */ + if (atomic_read(&subpage->writers) == 0) + return true; + ASSERT(atomic_read(&subpage->writers) >= nbits); return atomic_sub_and_test(nbits, &subpage->writers); } @@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -/* - * Convert the [start, start + len) range into a u16 bitmap - * - * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. - */ -static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, + unsigned int nbits) { - const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; - const int nbits = len >> fs_info->sectorsize_bits; + unsigned int found_zero; - btrfs_subpage_assert(fs_info, page, start, len); + found_zero = find_next_zero_bit(addr, start + nbits, start); + if (found_zero == start + nbits) + return true; + return false; +} - /* - * Here nbits can be 16, thus can go beyond u16 range. We make the - * first left shift to be calculate in unsigned long (at least u32), - * then truncate the result to u16. 
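A minimal, self-contained illustration (demo only, not from the patch) of the range-check semantics used by bitmap_test_range_all_set() and its all-zero counterpart above: the search is bounded at start + nbits, so reaching that bound means no zero (respectively set) bit exists inside [start, start + nbits).

static bool range_all_set_demo(void)
{
	unsigned long map = 0;

	bitmap_set(&map, 4, 4);			/* set bits 4..7 */
	/* No zero bit found in [4, 8) means the whole range is set. */
	return find_next_zero_bit(&map, 8, 4) == 8;
}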
- */ - return (u16)(((1UL << nbits) - 1) << bit_start); +static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, + unsigned int nbits) +{ + unsigned int found_set; + + found_set = find_next_bit(addr, start + nbits, start); + if (found_set == start + nbits) + return true; + return false; } +#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, page, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + +#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ + bitmap_test_range_all_set(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + +#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ + bitmap_test_range_all_zero(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap |= tmp; - if (subpage->uptodate_bitmap == U16_MAX) + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) SetPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap &= ~tmp; + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); ClearPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap &= ~tmp; - if (subpage->error_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) 
ClearPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); set_page_dirty(page); } @@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; bool last = false; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap &= ~tmp; - if (subpage->dirty_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); set_page_writeback(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap &= ~tmp; - if (subpage->writeback_bitmap == 0) { + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { ASSERT(PageWriteback(page)); end_page_writeback(page); } @@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = 
btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap &= ~tmp; - if (subpage->ordered_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) ClearPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } + +void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + SetPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + ClearPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + name, start, len); \ unsigned long flags; \ bool ret; \ \ spin_lock_irqsave(&subpage->lock, flags); \ - ret = ((subpage->name##_bitmap & tmp) == tmp); \ + ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + len >> fs_info->sectorsize_bits); \ spin_unlock_irqrestore(&subpage->lock, flags); \ return ret; \ } @@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, PageOrdered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit @@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->private); - ASSERT(subpage->dirty_bitmap == 0); + ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); +} + +/* + * Handle different locked pages with different page sizes: + * + * - Page locked by plain lock_page() + * It should not have any subpage::writers count. + * Can be unlocked by unlock_page(). 
+ * This is the most common locked page for __extent_writepage() called + * inside extent_write_cache_pages() or extent_write_full_page(). + * Rarer cases include the @locked_page from extent_write_locked_range(). + * + * - Page locked by lock_delalloc_pages() + * There is only one caller, all pages except @locked_page for + * extent_write_locked_range(). + * In this case, we have to call the subpage helper to handle the case. + */ +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len) +{ + struct btrfs_subpage *subpage; + + ASSERT(PageLocked(page)); + /* For regular page size case, we just unlock the page */ + if (fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * For subpage case, there are two types of locked page. With or + * without writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers) == 0) + /* No writers, locked by plain lock_page() */ + return unlock_page(page); + + /* Have writers, use proper subpage helper to end it */ + btrfs_page_end_writer_lock(fs_info, page, start, len); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0120948f37a1..7accb5c40d33 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,38 @@ #include <linux/spinlock.h> /* - * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap - * is sufficient. Regular bitmap_* is not used due to size reasons. + * Extra info for subpage bitmap. + * + * For subpage we pack all uptodate/error/dirty/writeback/ordered/checked + * bitmaps into one larger bitmap. + * + * This structure records how they are organized in the bitmap: + * + * /- uptodate_offset /- error_offset /- dirty_offset + * | | | + * v v v + * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |<- bitmap_nr_bits ->| + * |<--------------- total_nr_bits ---------------->| */ -#define BTRFS_SUBPAGE_BITMAP_SIZE 16 +struct btrfs_subpage_info { + /* Number of bits for each bitmap */ + unsigned int bitmap_nr_bits; + + /* Total number of bits for the whole bitmap */ + unsigned int total_nr_bits; + + /* + * *_offset indicates where each bitmap starts, the length of each + * bitmap is always @bitmap_nr_bits, which is calculated from + * PAGE_SIZE / sectorsize. + */ + unsigned int uptodate_offset; + unsigned int error_offset; + unsigned int dirty_offset; + unsigned int writeback_offset; + unsigned int ordered_offset; + unsigned int checked_offset; +}; /* * Structure to trace status of each sector inside a page, attached to @@ -18,10 +46,6 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - u16 uptodate_bitmap; - u16 error_bitmap; - u16 dirty_bitmap; - u16 writeback_bitmap; /* * Both data and metadata needs to track how many readers are for the * page. * manages whether the subpage can be detached. 
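A worked example of the packed layout, assuming a 64K page with 4K sectors (the numbers follow directly from btrfs_init_subpage_info() shown earlier; the example function itself is illustrative, not from the patch):

static void subpage_layout_example(void)
{
	struct btrfs_subpage_info info;

	btrfs_init_subpage_info(&info, SZ_4K);	/* with PAGE_SIZE == SZ_64K */
	/*
	 * Resulting layout, six 16-bit bitmaps back to back:
	 *   bitmap_nr_bits   = 16
	 *   uptodate_offset  = 0
	 *   error_offset     = 16
	 *   dirty_offset     = 32
	 *   writeback_offset = 48
	 *   ordered_offset   = 64
	 *   checked_offset   = 80
	 *   total_nr_bits    = 96, so BITS_TO_LONGS(96) == 2 on 64-bit
	 */
}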
*/ atomic_t eb_refs; - /* Structures only used by data */ - struct { - atomic_t writers; - /* Tracke pending ordered extent in this sector */ - u16 ordered_bitmap; - }; + /* Structures only used by data */ + atomic_t writers; }; + unsigned long bitmaps[]; }; enum btrfs_subpage_type { @@ -53,15 +74,15 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page); /* Allocate additional data where page represents more than one sector */ -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type); +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, @@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); +DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 537d90bf5d84..0ec09fe01be6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_close_devices; } - bdev = fs_devices->latest_bdev; + bdev = fs_devices->latest_dev->bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { @@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, new_pool_size); } @@ -2006,7 +2005,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { btrfs_err(fs_info, "Remounting read-write after error is not allowed"); ret = -EINVAL; @@ -2463,30 +2462,16 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_device *dev, *first_dev = NULL; /* - * Lightweight locking of the devices. We should not need - * device_list_mutex here as we only read the device data and the list - * is protected by RCU. Even if a device is deleted during the list - * traversals, we'll get valid data, the freeing callback will wait at - * least until the rcu_read_unlock. + * There should always be a valid pointer in latest_dev, it may be stale + * for a short moment in case it's being deleted but still valid until + * the end of the RCU grace period. 
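The RCU pattern the simplified btrfs_show_devname() relies on, as a standalone sketch (the helper name is hypothetical; rcu_read_lock(), rcu_str_deref() and seq_escape() are the existing APIs used by the patch):

static void show_latest_devname(struct seq_file *m,
				struct btrfs_fs_info *fs_info)
{
	rcu_read_lock();
	/* The name stays valid at least until rcu_read_unlock(). */
	seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name),
		   " \t\n\\");
	rcu_read_unlock();
}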
*/ rcu_read_lock(); - list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - - if (first_dev) - seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); - else - WARN_ON(1); + seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); rcu_read_unlock(); + return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25a6f587852b..beb7f72d50b8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return scnprintf(buf, PAGE_SIZE, "0\n"); + return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i == 0 ? "" : " "), btrfs_super_csum_name(i)); + ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), + btrfs_super_csum_name(i)); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); static ssize_t send_stream_version_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); + return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); @@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, int i; for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i ? " " : ""), rescue_opts[i]); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "%s%s", (i ? 
" " : ""), rescue_opts[i]); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_rescue_options, @@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, /* 4K sector size is also supported with 64K page size */ if (PAGE_SIZE == SZ_64K) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K); + ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); /* Only sectorsize == PAGE_SIZE is now supported */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; } @@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_bitmap_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_extent_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.iops_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, @@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.kbps_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, @@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(fs_info->discard_ctl.max_discard_size)); + return sysfs_emit(buf, "%llu\n", + 
READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, @@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } /* @@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%pU\n", - fs_info->fs_devices->metadata_uuid); + return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); @@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(fs_info->csum_shash)); + return sysfs_emit(buf, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); } BTRFS_ATTR(, checksum, btrfs_checksum_show); @@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, str = "UNKNOWN\n"; break; } - return scnprintf(buf, PAGE_SIZE, "%s", str); + return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); @@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", 
fs_info->generation); + return sysfs_emit(buf, "%llu\n", fs_info->generation); } BTRFS_ATTR(, generation, btrfs_generation_show); @@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); ssize_t ret; - ret = scnprintf(buf, PAGE_SIZE, "%d\n", - READ_ONCE(fs_info->bg_reclaim_threshold)); + ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); return ret; } @@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); @@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); @@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(device->scrub_speed_max)); + return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, @@ -1538,10 +1533,20 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid); +} +BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show); + static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1549,14 +1554,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, devid_kobj); if (!device->dev_stats_valid) - return scnprintf(buf, PAGE_SIZE, "invalid\n"); + return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. Keep them in sync and in order of definition of * btrfs_dev_stat_values. 
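The shape of the conversion applied throughout sysfs.c above: sysfs_emit() replaces the scnprintf(buf, PAGE_SIZE, ...) pattern and additionally checks that the buffer is page-aligned, while sysfs_emit_at() does the same for writes at an offset. A hypothetical show() callback for illustration (the attribute and value are made up):

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *a,
			    char *buf)
{
	/* Before: return scnprintf(buf, PAGE_SIZE, "%d\n", 42); */
	return sysfs_emit(buf, "%d\n", 42);
}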
*/ - return scnprintf(buf, PAGE_SIZE, + return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" @@ -1577,6 +1582,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), + BTRFS_ATTR_PTR(devid, fsid), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 3a4099a2bf05..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; + btrfs_global_root_delete(root); btrfs_put_root(root); } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index df54cdfdc250..51a8b075c259 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) struct btrfs_path *path = NULL; struct btrfs_root *root = NULL; struct extent_buffer *eb; - struct btrfs_item *item; char *value = "mary had a little lamb"; char *split1 = "mary had a little"; char *split2 = " lamb"; @@ -60,8 +59,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, 1); - item = btrfs_item_nr(0); + btrfs_setup_item_for_insert(root, path, &key, value_len); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); @@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split1)) { + if (btrfs_item_size(eb, 0) != strlen(split1)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 1) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(0); - if (btrfs_item_size(eb, item) != strlen(split3)) { + if (btrfs_item_size(eb, 0) != strlen(split3)) { test_err("invalid len in the first split"); ret = -EINVAL; goto out; @@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(1); - if (btrfs_item_size(eb, item) != strlen(split4)) { + if (btrfs_item_size(eb, 1) != strlen(split4)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; @@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - item = btrfs_item_nr(2); - if (btrfs_item_size(eb, item) != strlen(split2)) { + if (btrfs_item_size(eb, 2) != strlen(split2)) { test_err("invalid len in the second split"); ret = -EINVAL; goto out; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 73e96d505f4f..a232b15b8021 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } +#define STATE_FLAG_STR_LEN 256 + +#define 
PRINT_ONE_FLAG(state, dest, cur, name) \ +({ \ + if (state->state & EXTENT_##name) \ + cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \ + "%s" #name, cur == 0 ? "" : "|"); \ +}) + +static void extent_flag_to_str(const struct extent_state *state, char *dest) +{ + int cur = 0; + + dest[0] = 0; + PRINT_ONE_FLAG(state, dest, cur, DIRTY); + PRINT_ONE_FLAG(state, dest, cur, UPTODATE); + PRINT_ONE_FLAG(state, dest, cur, LOCKED); + PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DELALLOC); + PRINT_ONE_FLAG(state, dest, cur, DEFRAG); + PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); + PRINT_ONE_FLAG(state, dest, cur, NODATASUM); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); + PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); + PRINT_ONE_FLAG(state, dest, cur, DAMAGED); + PRINT_ONE_FLAG(state, dest, cur, NORESERVE); + PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); + PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); +} + +static void dump_extent_io_tree(const struct extent_io_tree *tree) +{ + struct rb_node *node; + char flags_str[STATE_FLAG_STR_LEN]; + + node = rb_first(&tree->state); + test_msg("io tree content:"); + while (node) { + struct extent_state *state; + + state = rb_entry(node, struct extent_state, rb_node); + extent_flag_to_str(state, flags_str); + test_msg(" start=%llu len=%llu flags=%s", state->start, + state->end + 1 - state->start, flags_str); + node = rb_next(node); + } +} + static int test_find_delalloc(u32 sectorsize) { struct inode *inode; @@ -112,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -143,7 +191,7 @@ static int test_find_delalloc(u32 sectorsize) } set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -177,14 +225,14 @@ static int test_find_delalloc(u32 sectorsize) goto out_bits; } start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; } - if (end != (u64)-1) { + if (end != test_start + PAGE_SIZE - 1) { test_err("did not return the proper end offset"); goto out_bits; } @@ -198,7 +246,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -233,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize) /* We unlocked it in the previous test */ lock_page(locked_page); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; /* * Currently if we fail to find dirty pages in the delalloc range we * will adjust max_bytes down to PAGE_SIZE and then re-search. 
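The recurring call shape in the updated delalloc tests above: find_lock_delalloc_range() treats @start and @end as in/out parameters, so the tests now prime @end with the last byte of the first page of the search window instead of 0. A sketch of the call shape (not a complete test):

static void delalloc_search_example(struct inode *inode,
				    struct page *locked_page, u64 test_start)
{
	u64 start = test_start;
	u64 end = test_start + PAGE_SIZE - 1;	/* previously primed with 0 */

	if (find_lock_delalloc_range(inode, locked_page, &start, &end)) {
		/* [start, end] now covers the locked delalloc range. */
	}
}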
If @@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: + if (ret) + dump_extent_io_tree(tmp); clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) @@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void) ret = 0; out: + if (ret) + dump_extent_io_tree(&tree); clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8f05c1eb833f..5930cdcae5cb 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return 0; } +static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return true; +} + +static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) +{ + const struct btrfs_free_space_op test_free_space_ops = { + .use_bitmap = bytes_index_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + u64 offset, max_extent_size, bytes; + int ret, i; + + test_msg("running bytes index tests"); + + /* First just validate that it does everything in order. */ + offset = 0; + for (i = 0; i < 10; i++) { + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 0); + if (ret) { + test_err("couldn't add extent entry %d\n", ret); + return ret; + } + offset += bytes + sectorsize; + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps do the correct thing. */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + for (i = 0; i < 2; i++) { + offset = i * BITS_PER_BITMAP * sectorsize; + bytes = (i + 1) * SZ_1M; + ret = test_add_free_space_entry(cache, offset, bytes, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + } + + for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node; + node = rb_next(node), i--) { + entry = rb_entry(node, struct btrfs_free_space, bytes_index); + bytes = (i + 1) * SZ_1M; + if (entry->bytes != bytes) { + test_err("invalid bytes index order, found %llu expected %llu", + entry->bytes, bytes); + return -EINVAL; + } + } + + /* Now validate bitmaps with different ->max_extent_size. */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; + + ret = test_add_free_space_entry(cache, 0, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap entry"); + return ret; + } + + offset = BITS_PER_BITMAP * sectorsize; + ret = test_add_free_space_entry(cache, offset, sectorsize, 1); + if (ret) { + test_err("couldn't add bitmap_entry"); + return ret; + } + + /* + * Now set a bunch of sectorsize extents in the first entry so its + * ->bytes is large. 
+ */ + for (i = 2; i < 20; i += 2) { + offset = sectorsize * i; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error populating sparse bitmap %d", ret); + return ret; + } + } + + /* + * Now set a contiguous extent in the second bitmap so its + * ->max_extent_size is larger than the first bitmap's. + */ + offset = (BITS_PER_BITMAP * sectorsize) + sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding contiguous extent %d", ret); + return ret; + } + + /* + * Since we don't set ->max_extent_size unless we search, everything + * should be indexed on bytes. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (10 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3, + 0, &max_extent_size); + if (offset != 0) { + test_err("found space to alloc even though we don't have enough space"); + return -EINVAL; + } + + if (max_extent_size != (2 * sectorsize)) { + test_err("got the wrong max_extent size %llu expected %llu", + max_extent_size, (unsigned long long)(2 * sectorsize)); + return -EINVAL; + } + + /* + * The search should have re-arranged the bytes index to use the + * ->max_extent_size, validate it's now what we expect it to be. + */ + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (2 * sectorsize)) { + test_err("error, the bytes index wasn't recalculated properly"); + return -EINVAL; + } + + /* Add another sectorsize to re-arrange the tree back to ->bytes. */ + offset = (BITS_PER_BITMAP * sectorsize) - sectorsize; + ret = btrfs_add_free_space(cache, offset, sectorsize); + if (ret) { + test_err("error adding extent to the sparse entry %d", ret); + return ret; + } + + entry = rb_entry(rb_first_cached(&ctl->free_space_bytes), + struct btrfs_free_space, bytes_index); + if (entry->bytes != (11 * sectorsize)) { + test_err("error, wrong entry in the first slot in bytes_index"); + return -EINVAL; + } + + /* + * Now make sure we find the correct entry after a search that will + * result in a re-arranging of the tree. 
+ */ + max_extent_size = 0; + offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2, + 0, &max_extent_size); + if (offset != (BITS_PER_BITMAP * sectorsize)) { + test_err("error, found %llu instead of %llu for our alloc", + offset, + (unsigned long long)(BITS_PER_BITMAP * sectorsize)); + return -EINVAL; + } + + cache->free_space_ctl->op = orig_free_space_ops; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; @@ -858,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; } - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); ret = test_extents(cache); if (ret) @@ -871,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) goto out; ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); + if (ret) + goto out; + ret = test_bytes_index(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 2c783d2f5228..13734ed43bfc 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); - root->fs_info->free_space_root = root; + root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index c9874b12d337..cac89c388131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); } /* diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 19ba7d5b7d8f..eee1e4459541 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) } /* We are using this root as our extent root */ - root->fs_info->extent_root = root; + root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + btrfs_global_root_insert(root); /* * Some of the paths we test assume we have a filled out fs_info, so we diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 14b9fdc8aaa9..03de89b45f27 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -162,7 +162,17 @@ static noinline void 
switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root, *tmp; struct btrfs_caching_control *caching_ctl, *next; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); @@ -283,7 +293,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -331,7 +341,7 @@ loop: */ kfree(cur_trans); goto loop; - } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); kfree(cur_trans); return -EROFS; @@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { - WARN_ON(root == fs_info->extent_root); WARN_ON(!force && root->commit_root != root->node); /* @@ -579,7 +588,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); if (ret) goto reserve_fail; if (delayed_refs_bytes) { @@ -692,7 +701,6 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->root = root; refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; @@ -991,8 +999,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (TRANS_ABORTED(trans) || - test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { + if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) err = trans->aborted; @@ -1237,6 +1244,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct extent_buffer *eb; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. 
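+	 *
+	 * (The commit path only gets here after waiting for num_writers to
+	 * drop to one and moving the transaction to
+	 * TRANS_STATE_COMMIT_DOING, hence the assertion below.)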
+ */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, BTRFS_NESTING_COW); @@ -1268,9 +1281,8 @@ again: root = list_entry(next, struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); - if (root != fs_info->extent_root) - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1300,9 +1312,6 @@ again: if (!list_empty(&fs_info->dirty_cowonly_roots)) goto again; - list_add_tail(&fs_info->extent_root->dirty_list, - &trans->transaction->switch_commits); - /* Update dev-replace pointer once everything is committed */ fs_info->dev_replace.committed_cursor_left = fs_info->dev_replace.cursor_left_last_write_of_item; @@ -1328,7 +1337,8 @@ void btrfs_add_dead_root(struct btrfs_root *root) } /* - * update all the cowonly tree roots on disk + * Update each subvolume root and its relocation root, if it exists, in the tree + * of tree roots. Also free log roots if they exist. */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { @@ -1337,6 +1347,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) int i; int ret; + /* + * At this point no one can be using this transaction to modify any tree + * and no one can start another transaction to modify any tree either. + */ + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, @@ -1349,6 +1365,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *root = gang[i]; int ret2; + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1473,12 +1497,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return ret; } - /* - * We are going to commit transaction, see btrfs_commit_transaction() - * comment for reason locking tree_log_mutex - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) goto out; @@ -1514,8 +1532,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, "Error while writing out transaction for qgroup"); out: - mutex_unlock(&fs_info->tree_log_mutex); - /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. @@ -1579,7 +1595,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { - pending->error = btrfs_block_rsv_add(root, + pending->error = btrfs_block_rsv_add(fs_info, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); @@ -1862,50 +1878,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) return ret; } -/* - * commit transactions asynchronously. once btrfs_commit_transaction_async - * returns, any subsequent transaction will not be allowed to join. 
- */ -struct btrfs_async_commit { - struct btrfs_trans_handle *newtrans; - struct work_struct work; -}; - -static void do_async_commit(struct work_struct *work) -{ - struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work); - - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); - - current->journal_info = ac->newtrans; - - btrfs_commit_transaction(ac->newtrans); - kfree(ac); -} - -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_async_commit *ac; struct btrfs_transaction *cur_trans; - ac = kmalloc(sizeof(*ac), GFP_NOFS); - if (!ac) - return -ENOMEM; - - INIT_WORK(&ac->work, do_async_commit); - ac->newtrans = btrfs_join_transaction(trans->root); - if (IS_ERR(ac->newtrans)) { - int err = PTR_ERR(ac->newtrans); - kfree(ac); - return err; - } + /* Kick the transaction kthread. */ + set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); /* take transaction reference */ cur_trans = trans->transaction; @@ -1914,28 +1894,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) btrfs_end_transaction(trans); /* - * Tell lockdep we've released the freeze rwsem, since the - * async commit thread will be the one to unlock it. - */ - if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_release(fs_info->sb, SB_FREEZE_FS); - - schedule_work(&ac->work); - /* * Wait for the current transaction commit to start and block * subsequent transaction joins */ wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); - if (current->journal_info == trans) - current->journal_info = NULL; - btrfs_put_transaction(cur_trans); - return 0; } - static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1987,7 +1954,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); if (current->journal_info == trans) current->journal_info = NULL; @@ -2155,7 +2122,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ - if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; goto cleanup_transaction; } @@ -2201,6 +2168,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* + * We've started the commit, clear the flag in case we were triggered to + * do an async commit but somebody else started before the transaction + * kthread could do the work. + */ + clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); + if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto scrub_continue; @@ -2247,24 +2221,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) WARN_ON(cur_trans != trans->transaction); - /* btrfs_commit_tree_roots is responsible for getting the - * various roots consistent with each other. 
Every pointer - * in the tree of tree roots has to point to the most up to date - * root for every subvolume and other tree. So, we have to keep - * the tree logging code from jumping in and changing any - * of the trees. - * - * At this point in the commit, there can't be any tree-log - * writers, but a little lower down we drop the trans mutex - * and let new people in. By holding the tree_log_mutex - * from now until after the super is written, we avoid races - * with the tree-log code. - */ - mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * Since the transaction is done, we can apply the pending changes @@ -2283,11 +2242,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ ret = btrfs_qgroup_account_extents(trans); if (ret < 0) - goto unlock_tree_log; + goto unlock_reloc; ret = commit_cowonly_roots(trans); if (ret) - goto unlock_tree_log; + goto unlock_reloc; /* * The tasks which save the space cache and inode cache may also @@ -2295,7 +2254,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - goto unlock_tree_log; + goto unlock_reloc; } cur_trans = fs_info->running_transaction; @@ -2328,6 +2287,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_chunk_metadata(trans); + /* + * Before changing the transaction state to TRANS_STATE_UNBLOCKED and + * setting fs_info->running_transaction to NULL, lock tree_log_mutex to + * make sure that before we commit our superblock, no other task can + * start a new transaction and commit a log tree before we commit our + * superblock. Anyone trying to commit a log tree locks this mutex before + * writing its superblock. 
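+	 *
+	 * An illustrative interleaving this ordering prevents (a sketch,
+	 * with the lock/unlock steps abridged):
+	 *
+	 *	commit task			fsync task
+	 *	lock tree_log_mutex
+	 *	state = TRANS_STATE_UNBLOCKED
+	 *					btrfs_sync_log()
+	 *					  lock tree_log_mutex (blocks)
+	 *	write superblock
+	 *	unlock tree_log_mutex
+	 *					  write log tree + superblock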
+ */ + mutex_lock(&fs_info->tree_log_mutex); + spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; fs_info->running_transaction = NULL; @@ -2340,10 +2309,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); - /* - * reloc_mutex has been unlocked, tree_log_mutex is still held - * but we can't jump to unlock_tree_log causing double unlock - */ mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2394,7 +2359,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); - trace_btrfs_transaction_commit(trans->root); + trace_btrfs_transaction_commit(fs_info); btrfs_scrub_continue(fs_info); @@ -2405,8 +2370,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; -unlock_tree_log: - mutex_unlock(&fs_info->tree_log_mutex); unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); scrub_continue: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ba45065f9451..1852ed9de7fd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -135,7 +135,6 @@ struct btrfs_trans_handle { bool removing_chunk; bool reloc_reserved; bool in_fsync; - struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; }; @@ -217,7 +216,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); +void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7733e8ac0a69..72e1c942197d 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 extent_end; if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { @@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, key->offset, sectorsize); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { + if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", - btrfs_item_size_nr(leaf, slot), csumsize); + btrfs_item_size(leaf, slot), csumsize); return -EUCLEAN; } if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { u64 prev_csum_end; u32 prev_item_size; - prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_item_size = btrfs_item_size(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; if (unlikely(prev_csum_end > key->offset)) { @@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf, { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = 
btrfs_item_size(leaf, slot); u32 cur = 0; if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) @@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; - u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 type; @@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, { int num_stripes; - if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { + if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; @@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, goto out; if (unlikely(btrfs_chunk_item_size(num_stripes) != - btrfs_item_size_nr(leaf, slot))) { + btrfs_item_size(leaf, slot))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } @@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (unlikely(ret < 0)) return ret; - if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && - btrfs_item_size_nr(leaf, slot) != + if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && + btrfs_item_size(leaf, slot) != btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", - btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_item_size(leaf, slot), sizeof(ri), btrfs_legacy_root_item_size()); return -EUCLEAN; } @@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, * And since we allow geneartion_v2 as 0, it will still pass the check. 
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - btrfs_item_size_nr(leaf, slot)); + btrfs_item_size(leaf, slot)); /* Generation related */ if (unlikely(btrfs_root_generation(&ri) > @@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf, bool is_tree_block = false; unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, if (key->type == BTRFS_SHARED_DATA_REF_KEY) expect_item_size = sizeof(struct btrfs_shared_data_ref); - if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { + if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), expect_item_size, key->type); return -EUCLEAN; } @@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf, { struct btrfs_extent_data_ref *dref; unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); - const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + const unsigned long end = ptr + btrfs_item_size(leaf, slot); - if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { + if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } @@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf, if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ - if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { + if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", - btrfs_item_size_nr(leaf, slot), + btrfs_item_size(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; } ptr = btrfs_item_ptr_offset(leaf, slot); - end = ptr + btrfs_item_size_nr(leaf, slot); + end = ptr + btrfs_item_size(leaf, slot); while (ptr < end) { u16 namelen; @@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) if (slot == 0) item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); else - item_end_expected = btrfs_item_offset_nr(leaf, + item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { + if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %u expect %u", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), item_end_expected); return -EUCLEAN; } @@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. 
*/ - if (unlikely(btrfs_item_end_nr(leaf, slot) > + if (unlikely(btrfs_item_data_end(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_end_nr(leaf, slot), + btrfs_item_data_end(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 7c45d960b53c..b6cf39f4e7e4 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int next_key_ret = 0; u64 last_ret = 0; - if (root->fs_info->extent_root == root) { - /* - * there's recursion here right now in the tree locking, - * we can't defrag the extent root without deadlock - */ - goto out; - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7efc26aa82a..c1ddbe800897 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,6 +20,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "inode-item.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -94,7 +95,7 @@ enum { }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, @@ -207,7 +208,7 @@ again: } atomic_inc(&root->log_writers); - if (ctx && !ctx->logging_new_name) { + if (!ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -368,25 +369,11 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static noinline int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int do_overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -400,18 +387,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* look for the key in the destination tree */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + /* Our caller must have done a search for the key for us. 
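+	 * (For example, overwrite_item() below calls
+	 * btrfs_search_slot(NULL, root, key, path, 0, 0) right before
+	 * calling us.)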
*/ + ASSERT(path->nodes[0] != NULL); + + /* + * And the slot must point to the exact key or the slot where the key + * should be at (the first item with a key greater than 'key') + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + ret = btrfs_comp_cpu_keys(&found_key, key); + ASSERT(ret >= 0); + } else { + ret = 1; + } if (ret == 0) { char *src_copy; char *dst_copy; - u32 dst_size = btrfs_item_size_nr(path->nodes[0], + u32 dst_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (dst_size != item_size) goto insert; @@ -505,7 +504,7 @@ insert: /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; - found_size = btrfs_item_size_nr(path->nodes[0], + found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) btrfs_truncate_item(path, item_size, 1); @@ -585,6 +584,36 @@ no_copy: } /* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + + /* Look for the key in the destination tree. 
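+	 * (A read-only lookup: no transaction handle, and both the ins_len
+	 * and cow arguments of btrfs_search_slot() are zero.)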
+	 */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
  * simple helper to read an inode off the disk from a given root
  * This can only be called for subvolume roots and not for the log
  */
@@ -761,7 +790,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 						ins.objectid, ins.offset, 0);
 			btrfs_init_data_ref(&ref, root->root_key.objectid,
-					    key->objectid, offset);
+					    key->objectid, offset, 0, false);
 			ret = btrfs_inc_extent_ref(trans, &ref);
 			if (ret)
 				goto out;
@@ -844,17 +873,21 @@
 			 */
 			while (!list_empty(&ordered_sums)) {
 				struct btrfs_ordered_sum *sums;
+				struct btrfs_root *csum_root;
+
 				sums = list_entry(ordered_sums.next,
 						  struct btrfs_ordered_sum,
 						  list);
+				csum_root = btrfs_csum_root(fs_info,
+							    sums->bytenr);
 				if (!ret)
-					ret = btrfs_del_csums(trans,
-							      fs_info->csum_root,
+					ret = btrfs_del_csums(trans, csum_root,
 							      sums->bytenr,
 							      sums->len);
 				if (!ret)
 					ret = btrfs_csum_file_blocks(trans,
-						fs_info->csum_root, sums);
+								     csum_root,
+								     sums);
 				list_del(&sums->list);
 				kfree(sums);
 			}
@@ -893,11 +926,11 @@ out:
  * item
  */
 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      struct btrfs_inode *dir,
 				      struct btrfs_dir_item *di)
 {
+	struct btrfs_root *root = dir->root;
 	struct inode *inode;
 	char *name;
 	int name_len;
@@ -926,7 +959,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+	ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
 			name_len);
 	if (ret)
 		goto out;
@@ -939,9 +972,11 @@ out:
 }
 
 /*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
+ * exists.
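+ *
+ * A typical caller (see add_inode_ref()) treats 1 as "the link is already in
+ * place", 0 as "resolve any conflicts, then add the link", and < 0 as a hard
+ * error.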
*/ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -950,29 +985,34 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1061,7 +1101,7 @@ again: * otherwise they must be unlinked as a conflict */ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { victim_ref = (struct btrfs_inode_ref *)ptr; victim_name_len = btrfs_inode_ref_name_len(leaf, @@ -1084,7 +1124,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, dir, inode, + ret = btrfs_unlink_inode(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) @@ -1120,7 +1160,7 @@ again: leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { @@ -1146,6 +1186,7 @@ again: parent_objectid, victim_name, victim_name_len); if (ret < 0) { + kfree(victim_name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1155,7 +1196,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(victim_parent), inode, victim_name, @@ -1182,8 +1223,10 @@ next: /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1192,8 +1235,10 @@ next: /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1278,7 +1323,7 @@ again: eb = path->nodes[0]; ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { char *name = NULL; int namelen; @@ -1313,7 +1358,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); 
iput(dir); @@ -1374,10 +1419,11 @@ out: return ret; } -static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static int add_link(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, const char *name, int namelen, u64 ref_index) { + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_dir_item *dir_item; struct btrfs_key key; struct btrfs_path *path; @@ -1411,7 +1457,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), name, namelen); if (ret) goto out; @@ -1463,7 +1509,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + ref_end = ref_ptr + btrfs_item_size(eb, slot); if (key->type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *r; @@ -1517,10 +1563,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1555,7 +1603,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -1571,7 +1619,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; /* insert our name */ - ret = add_link(trans, root, dir, inode, name, namelen, + ret = add_link(trans, dir, inode, name, namelen, ref_index); if (ret) goto out; @@ -1580,6 +1628,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; } + /* Else, ret == 1, we already have a perfect match, we're done. */ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); @@ -1634,7 +1683,7 @@ static int count_inode_extrefs(struct btrfs_root *root, break; leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); cur_offset = 0; @@ -1688,7 +1737,7 @@ process_slot: key.type != BTRFS_INODE_REF_KEY) break; ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + ptr_end = ptr + btrfs_item_size(path->nodes[0], path->slots[0]); while (ptr < ptr_end) { struct btrfs_inode_ref *ref; @@ -1906,6 +1955,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } +static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_path *path, + struct btrfs_dir_item *dst_di, + const struct btrfs_key *log_key, + u8 log_type, + bool exists) +{ + struct btrfs_key found_key; + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* The existing dentry points to the same inode, don't delete it. 
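+	 * (Return convention: 1 when the entry matches and is kept, 0 when
+	 * it was dropped, or kept only because the new inode does not exist
+	 * yet, and < 0 on error.)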
*/ + if (found_key.objectid == log_key->objectid && + found_key.type == log_key->type && + found_key.offset == log_key->offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) + return 1; + + /* + * Don't drop the conflicting directory entry if the inode for the new + * entry doesn't exist. + */ + if (!exists) + return 0; + + return drop_one_dir_item(trans, path, dir, dst_di); +} + /* * take a single entry in a log directory item and replay it into * the subvolume. @@ -1931,14 +2008,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, { char *name; int name_len; - struct btrfs_dir_item *dst_di; - struct btrfs_key found_key; + struct btrfs_dir_item *dir_dst_di; + struct btrfs_dir_item *index_dst_di; + bool dir_dst_matches = false; + bool index_dst_matches = false; struct btrfs_key log_key; + struct btrfs_key search_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; - bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool exists; + int ret; + bool update_size = true; bool name_added = false; dir = read_one_inode(root, key->objectid); @@ -1957,79 +2037,60 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; - if (key->type == BTRFS_DIR_ITEM_KEY) { - dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); - } else if (key->type == BTRFS_DIR_INDEX_KEY) { - dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, - key->offset, name, - name_len, 1); - } else { - /* Corruption */ - ret = -EINVAL; + dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + if (IS_ERR(dir_dst_di)) { + ret = PTR_ERR(dir_dst_di); goto out; - } - if (IS_ERR_OR_NULL(dst_di)) { - /* we need a sequence number to insert, so we only - * do inserts for the BTRFS_DIR_INDEX_KEY types - */ - if (key->type != BTRFS_DIR_INDEX_KEY) + } else if (dir_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + dir_dst_di, &log_key, log_type, + exists); + if (ret < 0) goto out; - goto insert; + dir_dst_matches = (ret == 1); } - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); - /* the existing item matches the logged item */ - if (found_key.objectid == log_key.objectid && - found_key.type == log_key.type && - found_key.offset == log_key.offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) { - update_size = false; + btrfs_release_path(path); + + index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, key->offset, + name, name_len, 1); + if (IS_ERR(index_dst_di)) { + ret = PTR_ERR(index_dst_di); goto out; + } else if (index_dst_di) { + ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, + index_dst_di, &log_key, + log_type, exists); + if (ret < 0) + goto out; + index_dst_matches = (ret == 1); } - /* - * don't drop the conflicting directory entry if the inode - * for the new entry doesn't exist - */ - if (!exists) - goto out; + btrfs_release_path(path); - ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); - if (ret) + if (dir_dst_matches && index_dst_matches) { + ret = 0; + update_size = false; goto out; - - if (key->type == BTRFS_DIR_INDEX_KEY) - goto insert; -out: - 
btrfs_release_path(path); - if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); - iput(dir); - if (!ret && name_added) - ret = 1; - return ret; -insert: /* * Check if the inode reference exists in the log for the given name, * inode and parent inode */ - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_REF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); if (ret < 0) { goto out; } else if (ret) { @@ -2039,10 +2100,10 @@ insert: goto out; } - found_key.objectid = log_key.objectid; - found_key.type = BTRFS_INODE_EXTREF_KEY; - found_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + search_key.objectid = log_key.objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &search_key, key->objectid, name, name_len); if (ret < 0) { goto out; @@ -2061,87 +2122,76 @@ insert: name_added = true; update_size = false; ret = 0; - goto out; + +out: + if (!ret && update_size) { + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + } + kfree(name); + iput(dir); + if (!ret && name_added) + ret = 1; + return ret; } -/* - * find all the names in a directory item and reconcile them into - * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than - * one name in a directory item, but the same code gets used for - * both directory index types - */ +/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + int ret; struct btrfs_dir_item *di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - struct btrfs_path *fixup_path = NULL; - - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - ret = replay_one_name(trans, root, path, eb, di, key); - if (ret < 0) - break; - ptr = (unsigned long)(di + 1); - ptr += name_len; - /* - * If this entry refers to a non-directory (directories can not - * have a link count > 1) and it was added in the transaction - * that was not committed, make sure we fixup the link count of - * the inode it the entry points to. Otherwise something like - * the following would result in a directory pointing to an - * inode with a wrong link that does not account for this dir - * entry: - * - * mkdir testdir - * touch testdir/foo - * touch testdir/bar - * sync - * - * ln testdir/bar testdir/bar_link - * ln testdir/foo testdir/foo_link - * xfs_io -c "fsync" testdir/bar - * - * <power failure> - * - * mount fs, log replay happens - * - * File foo would remain with a link count of 1 when it has two - * entries pointing to it in the directory testdir. This would - * make it impossible to ever delete the parent directory has - * it would result in stale dentries that can never be deleted. 
- */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { - struct btrfs_key di_key; + /* We only log dir index keys, which only contain a single dir item. */ + ASSERT(key->type == BTRFS_DIR_INDEX_KEY); - if (!fixup_path) { - fixup_path = btrfs_alloc_path(); - if (!fixup_path) { - ret = -ENOMEM; - break; - } - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + ret = replay_one_name(trans, root, path, eb, di, key); + if (ret < 0) + return ret; - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, - di_key.objectid); - if (ret) - break; - } - ret = 0; + /* + * If this entry refers to a non-directory (directories can not have a + * link count > 1) and it was added in the transaction that was not + * committed, make sure we fixup the link count of the inode the entry + * points to. Otherwise something like the following would result in a + * directory pointing to an inode with a wrong link that does not account + * for this dir entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two entries + * pointing to it in the directory testdir. This would make it impossible + * to ever delete the parent directory has it would result in stale + * dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_path *fixup_path; + struct btrfs_key di_key; + + fixup_path = btrfs_alloc_path(); + if (!fixup_path) + return -ENOMEM; + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); + btrfs_free_path(fixup_path); } - btrfs_free_path(fixup_path); + return ret; } @@ -2158,7 +2208,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, int key_type, + u64 dirid, u64 *start_ret, u64 *end_ret) { struct btrfs_key key; @@ -2171,7 +2221,7 @@ static noinline int find_dir_range(struct btrfs_root *root, return 1; key.objectid = dirid; - key.type = key_type; + key.type = BTRFS_DIR_LOG_INDEX_KEY; key.offset = *start_ret; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2185,7 +2235,7 @@ static noinline int find_dir_range(struct btrfs_root *root, if (ret != 0) btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto next; } @@ -2212,7 +2262,7 @@ next: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.type != key_type || key.objectid != dirid) { + if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { ret = 1; goto out; } @@ -2233,105 +2283,92 @@ out: * to is unlinked */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct inode *dir, struct btrfs_key *dir_key) { + struct btrfs_root *root = BTRFS_I(dir)->root; int ret; struct extent_buffer *eb; int slot; - u32 item_size; struct btrfs_dir_item *di; - struct btrfs_dir_item *log_di; int name_len; - unsigned long ptr; - unsigned long ptr_end; char *name; - struct inode *inode; + struct inode *inode = NULL; 
struct btrfs_key location; -again: + /* + * Currenly we only log dir index keys. Even if we replay a log created + * by an older kernel that logged both dir index and dir item keys, all + * we need to do is process the dir index keys, we (and our caller) can + * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). + */ + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; - goto out; - } - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { - log_di = btrfs_lookup_dir_item(trans, log, log_path, - dir_key->objectid, - name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { - log_di = btrfs_lookup_dir_index_item(trans, log, - log_path, - dir_key->objectid, - dir_key->offset, - name, name_len, 0); - } - if (!log_di || log_di == ERR_PTR(-ENOENT)) { - btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); - btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } - ret = link_to_fixup_dir(trans, root, - path, location.objectid); - if (ret) { - kfree(name); - iput(inode); - goto out; - } + read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); - inc_nlink(inode); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), - BTRFS_I(inode), name, name_len); - if (!ret) - ret = btrfs_run_delayed_items(trans); - kfree(name); - iput(inode); - if (ret) - goto out; + if (log) { + struct btrfs_dir_item *log_di; - /* there might still be more names under this key - * check and repeat if required - */ - ret = btrfs_search_slot(NULL, root, dir_key, path, - 0, 0); - if (ret == 0) - goto again; + log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } else if (log_di) { + /* The dentry exists in the log, we have nothing to do. */ ret = 0; goto out; - } else if (IS_ERR(log_di)) { - kfree(name); - return PTR_ERR(log_di); } - btrfs_release_path(log_path); - kfree(name); + } - ptr = (unsigned long)(di + 1); - ptr += name_len; + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(path); + btrfs_release_path(log_path); + inode = read_one_inode(root, location.objectid); + if (!inode) { + ret = -EIO; + goto out; } - ret = 0; + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + if (ret) + goto out; + + inc_nlink(inode); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len); + if (ret) + goto out; + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; + + /* + * Unlike dir item keys, dir index keys can only have one name (entry) in + * them, as there are no key collisions since each key has a unique offset + * (an index number), so we're done. 
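+	 *
+	 * (Dir item keys, by contrast, use a hash of the name as the key
+	 * offset, so several names can collide on one key.)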
+ */ out: btrfs_release_path(path); btrfs_release_path(log_path); + kfree(name); + iput(inode); return ret; } @@ -2374,7 +2411,7 @@ process_leaf: } di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size_nr(path->nodes[0], i); + total_size = btrfs_item_size(path->nodes[0], i); cur = 0; while (cur < total_size) { u16 name_len = btrfs_dir_name_len(path->nodes[0], di); @@ -2451,7 +2488,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, { u64 range_start; u64 range_end; - int key_type = BTRFS_DIR_LOG_ITEM_KEY; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; @@ -2459,7 +2495,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct inode *dir; dir_key.objectid = dirid; - dir_key.type = BTRFS_DIR_ITEM_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM; @@ -2473,16 +2509,18 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, btrfs_free_path(log_path); return 0; } -again: + range_start = 0; range_end = 0; while (1) { if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, key_type, + ret = find_dir_range(log, path, dirid, &range_start, &range_end); - if (ret != 0) + if (ret < 0) + goto out; + else if (ret > 0) break; } @@ -2505,13 +2543,15 @@ again: btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (found_key.objectid != dirid || - found_key.type != dir_key.type) - goto next_type; + found_key.type != dir_key.type) { + ret = 0; + goto out; + } if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, root, log, path, + ret = check_item_in_log(trans, log, path, log_path, dir, &found_key); if (ret) @@ -2525,15 +2565,7 @@ again: break; range_start = range_end + 1; } - -next_type: ret = 0; - if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { - key_type = BTRFS_DIR_LOG_INDEX_KEY; - dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(path); - goto again; - } out: btrfs_release_path(path); btrfs_free_path(log_path); @@ -2693,12 +2725,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_DIR_ITEM_KEY) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); - if (ret) - break; } + /* + * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the + * BTRFS_DIR_INDEX_KEY items which we use to derive the + * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an + * older kernel with such keys, ignore them. 
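+		 * (Each logged index key carries the full name, which is
+		 * enough for replay_one_name() to recreate the matching dir
+		 * item as well, presumably via insert_one_name().)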
+ */ } btrfs_free_path(path); return ret; @@ -2859,6 +2892,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, path->nodes[*level]->len); if (ret) return ret; + btrfs_redirty_list_add(trans->transaction, + next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -2939,6 +2974,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next->start, next->len); if (ret) goto out; + btrfs_redirty_list_add(trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3019,9 +3055,6 @@ static void wait_for_writer(struct btrfs_root *root) static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - if (!ctx) - return; - mutex_lock(&root->log_mutex); list_del_init(&ctx->list); mutex_unlock(&root->log_mutex); @@ -3310,7 +3343,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * writing the super here would result in transid mismatches. If there * is an error here just bail. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EIO; btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3392,8 +3425,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); - if (trans && log->node) - btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } @@ -3434,6 +3465,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return false; + /* * The inode's logged_trans is always 0 when we load it (because it is * not persisted in the inode item or elsewhere). So if it is 0, the @@ -3472,10 +3506,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans, * This optimizations allows us to avoid relogging the entire inode * or the entire directory. */ -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index) +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; @@ -3485,11 +3519,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; mutex_lock(&dir->log_mutex); @@ -3500,20 +3534,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, goto out_unlock; } - di = btrfs_lookup_dir_item(trans, log, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - btrfs_release_path(path); + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. 
+ */ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, index, name, name_len, -1); if (IS_ERR(di)) { @@ -3537,49 +3561,36 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err == -ENOSPC) { + if (err < 0) btrfs_set_log_full_commit(trans); - err = 0; - } else if (err < 0 && err != -ENOENT) { - /* ENOENT can be returned if the entry hasn't been fsynced yet */ - btrfs_abort_transaction(trans, err); - } - btrfs_end_log_trans(root); - - return err; } /* see comments for btrfs_del_dir_entries_in_log */ -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid) +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; u64 index; int ret; if (!inode_logged(trans, inode)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; log = root->log_root; mutex_lock(&inode->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); - if (ret == -ENOSPC) { + if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); - - return ret; } /* @@ -3590,7 +3601,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - int key_type, u64 dirid, + u64 dirid, u64 first_offset, u64 last_offset) { int ret; @@ -3599,10 +3610,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.objectid = dirid; key.offset = first_offset; - if (key_type == BTRFS_DIR_ITEM_KEY) - key.type = BTRFS_DIR_LOG_ITEM_KEY; - else - key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); if (ret) return ret; @@ -3615,33 +3623,226 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, return 0; } +static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *src, + struct btrfs_path *dst_path, + int start_slot, + int count) +{ + char *ins_data = NULL; + struct btrfs_item_batch batch; + struct extent_buffer *dst; + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_key key; + u32 item_size; + int ret; + int i; + + ASSERT(count > 0); + batch.nr = count; + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); + item_size = btrfs_item_size(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; + } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + + ins_data = kmalloc(count * sizeof(u32) + + count * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + + for (i = 0; i < count; i++) { + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); + ins_sizes[i] = btrfs_item_size(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } + + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); + if 
(ret) + goto out; + + dst = dst_path->nodes[0]; + /* + * Copy all the items in bulk, in a single copy operation. Item data is + * organized such that it's placed at the end of a leaf and from right + * to left. For example, the data for the second item ends at an offset + * that matches the offset where the data for the first item starts, the + * data for the third item ends at an offset that matches the offset + * where the data of the second item starts, and so on. + * Therefore our source and destination start offsets for copy match the + * offsets of the last items (highest slots). + */ + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); + src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); + copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); + btrfs_release_path(dst_path); +out: + kfree(ins_data); + + return ret; +} + +static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + struct extent_buffer *src = path->nodes[0]; + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + + if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { + last_found = true; + break; + } + + ctx->last_dir_item_offset = key.offset; + /* + * We must make sure that when we log a directory entry, the + * corresponding inode, after log replay, has a matching link + * count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in a fsync log that when replayed, our file inode + * would have a link count of 1, but we get two directory entries + * pointing to the same inode. After removing one of the names, + * it would not be possible to remove the other name, which + * always resulted in stale file handle errors, and would not be + * possible to rmdir the parent directory, since its i_size could + * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, + * resulting in -ENOTEMPTY errors. + */ + if (!ctx->log_new_dentries) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &di_key); + if ((btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + di_key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + if (!inode_logged_before) + goto add_to_batch; + + /* + * If we were logged before and have logged dir items, we can skip + * checking if any item with a key offset larger than the last one + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ + if (key.offset > inode->last_dir_index_offset) + goto add_to_batch; + /* + * Check if the key was already logged before. If not we can add + * it to a batch for bulk insertion. + */ + ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + btrfs_release_path(dst_path); + goto add_to_batch; + } + + /* + * Item exists in the log. 
Overwrite the item in the log if it + * has different content or do nothing if it has exactly the same + * content. And then flush the current batch if any - do it after + * overwriting the current item, or we would deadlock otherwise, + * since we are holding a path for the existing item. + */ + ret = do_overwrite_item(trans, log, dst_path, src, i, &key); + if (ret < 0) + return ret; + + if (batch_size > 0) { + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + batch_size = 0; + } + continue; +add_to_batch: + if (batch_size == 0) + batch_start = i; + batch_size++; + } + + if (batch_size > 0) { + int ret; + + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + } + + return last_found ? 1 : 0; +} + /* * log all the items included in the current transaction for a given * directory. This also creates the range items in the log tree required * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path, int key_type, + struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; + struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - struct extent_buffer *src; int err = 0; int ret; - int i; - int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); - log = root->log_root; - min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = min_offset; ret = btrfs_search_forward(root, &min_key, path, trans->transid); @@ -3650,9 +3851,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * we didn't find anything from this transaction, see if there * is anything at all */ - if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { + if (ret != 0 || min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { min_key.objectid = ino; - min_key.type = key_type; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = (u64)-1; btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); @@ -3660,7 +3862,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); return ret; } - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); /* if ret == 0 there are items for this type, * create a range to tell us the last key of this type. 
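The single bulk copy in flush_dir_items_batch() above leans on the btrfs leaf layout, where item data is packed right to left from the end of the leaf; a small runnable model of that offset arithmetic (toy leaf size and item sizes invented):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define LEAF_SIZE 64	/* toy leaf, real btrfs leaves are nodesize bytes */

int main(void)
{
	char src[LEAF_SIZE], dst[LEAF_SIZE] = { 0 };
	const uint32_t sizes[] = { 5, 7, 3 }; /* data sizes of 3 items */
	uint32_t end = LEAF_SIZE;
	uint32_t total = 0;

	memset(src, 'x', sizeof(src));
	/* Data for item 0 occupies the highest offsets; each following
	 * item's data ends where the previous item's data starts. */
	for (int i = 0; i < 3; i++) {
		end -= sizes[i];
		total += sizes[i];
	}
	/* One copy moves all three payloads: start at the offset of the
	 * last (highest-slot) item and copy the summed size. */
	memcpy(dst + end, src + end, total);
	printf("copied %u bytes starting at offset %u\n",
	       (unsigned)total, (unsigned)end);
	return 0;
}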
@@ -3671,18 +3873,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) + if (tmp.type == BTRFS_DIR_INDEX_KEY) first_offset = max(min_offset, tmp.offset) + 1; } goto done; } /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, ino, key_type); + ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) { + if (tmp.type == BTRFS_DIR_INDEX_KEY) { first_offset = tmp.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], @@ -3713,62 +3915,13 @@ search: * from our directory */ while (1) { - struct btrfs_key tmp; - src = path->nodes[0]; - nritems = btrfs_header_nritems(src); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - - btrfs_item_key_to_cpu(src, &min_key, i); - - if (min_key.objectid != ino || min_key.type != key_type) - goto done; - - if (need_resched()) { - btrfs_release_path(path); - cond_resched(); - goto search; - } - - ret = overwrite_item(trans, log, dst_path, src, i, - &min_key); - if (ret) { + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + if (ret != 0) { + if (ret < 0) err = ret; - goto done; - } - - /* - * We must make sure that when we log a directory entry, - * the corresponding inode, after log replay, has a - * matching link count. For example: - * - * touch foo - * mkdir mydir - * sync - * ln foo mydir/bar - * xfs_io -c "fsync" mydir - * <crash> - * <mount fs and log replay> - * - * Would result in a fsync log that when replayed, our - * file inode would have a link count of 1, but we get - * two directory entries pointing to the same inode. - * After removing one of the names, it would not be - * possible to remove the other name, which resulted - * always in stale file handle errors, and would not - * be possible to rmdir the parent directory, since - * its i_size could never decrement to the value - * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 
- */ - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(src, di, &tmp); - if (ctx && - (btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - tmp.type != BTRFS_ROOT_ITEM_KEY) - ctx->log_new_dentries = true; + goto done; } - path->slots[0] = nritems; + path->slots[0] = btrfs_header_nritems(path->nodes[0]); /* * look ahead to the next item and see if it is also @@ -3782,21 +3935,27 @@ search: err = ret; goto done; } - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != ino || tmp.type != key_type) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); + if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ctx->last_dir_item_offset = min_key.offset; ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], - &tmp); + &min_key); if (ret) err = ret; else - last_offset = tmp.offset; + last_offset = min_key.offset; goto done; } + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } } done: btrfs_release_path(path); @@ -3808,8 +3967,8 @@ done: * insert the log range keys to indicate where the log * is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, - ino, first_offset, last_offset); + ret = insert_dir_log_key(trans, log, path, ino, first_offset, + last_offset); if (ret) err = ret; } @@ -3829,7 +3988,7 @@ done: * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) @@ -3837,13 +3996,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 min_key; u64 max_key; int ret; - int key_type = BTRFS_DIR_ITEM_KEY; -again: + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and + * reloaded later, in which case its logged_trans is 0, reset the value + * of the last logged key offset. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past + * transaction, so we need to reset the last dir index offset to (u64)-1. + */ + if (inode->logged_trans != trans->transid) + inode->last_dir_index_offset = (u64)-1; + min_key = 0; max_key = 0; + ctx->last_dir_item_offset = inode->last_dir_index_offset; + while (1) { - ret = log_dir_items(trans, root, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, ctx, min_key, &max_key); if (ret) return ret; @@ -3852,10 +4026,8 @@ again: min_key = max_key + 1; } - if (key_type == BTRFS_DIR_ITEM_KEY) { - key_type = BTRFS_DIR_INDEX_KEY; - goto again; - } + inode->last_dir_index_offset = ctx->last_dir_item_offset; + return 0; } @@ -3865,17 +4037,21 @@ again: * This cannot be run for file data extents because it does not * free the extents they point to. 
*/ -static int drop_objectid_items(struct btrfs_trans_handle *trans, +static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - u64 objectid, int max_key_type) + struct btrfs_inode *inode, + int max_key_type) { int ret; struct btrfs_key key; struct btrfs_key found_key; int start_slot; - key.objectid = objectid; + if (!inode_logged(trans, inode)) + return 0; + + key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -3892,7 +4068,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (found_key.objectid != objectid) + if (found_key.objectid != key.objectid) break; found_key.offset = 0; @@ -3917,6 +4093,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static int truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_inode *inode, + u64 new_size, u32 min_type) +{ + struct btrfs_truncate_control control = { + .new_size = new_size, + .ino = btrfs_ino(inode), + .min_type = min_type, + .skip_ref_updates = true, + }; + + return btrfs_truncate_inode_items(trans, log_root, &control); +} + static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -4089,6 +4280,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; + struct btrfs_item_batch batch; char *ins_data; int i; struct list_head ordered_sums; @@ -4103,13 +4295,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + batch.nr = nr; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + ins_sizes[i] = btrfs_item_size(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } - ret = btrfs_insert_empty_items(trans, log, dst_path, - ins_keys, ins_sizes, nr); + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) { kfree(ins_data); return ret; @@ -4149,6 +4345,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG) { + struct btrfs_root *csum_root; u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); @@ -4167,8 +4364,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, cl = dl; } - ret = btrfs_lookup_csums_range( - fs_info->csum_root, + csum_root = btrfs_csum_root(fs_info, ds); + ret = btrfs_lookup_csums_range(csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); if (ret) @@ -4220,6 +4417,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ordered_extent *ordered; + struct btrfs_root *csum_root; u64 csum_offset; u64 csum_len; u64 mod_start = em->mod_start; @@ -4300,7 +4498,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. 
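Note that btrfs_lookup_csums_range(), used in copy_items() above and in log_extent_csums() below, takes an inclusive end offset, hence the recurring "+ len - 1" pattern; a trivial runnable model of the arithmetic (all byte values invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Invented example values, in bytes. */
	uint64_t block_start = 1 << 20;	/* extent's disk start */
	uint64_t csum_offset = 4096;	/* part already covered/skipped */
	uint64_t csum_len = 8192;	/* bytes still needing csums */
	uint64_t first = block_start + csum_offset;
	uint64_t last = block_start + csum_offset + csum_len - 1; /* inclusive */

	printf("lookup csums for [%llu, %llu]\n",
	       (unsigned long long)first, (unsigned long long)last);
	return 0;
}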
*/ - ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, + csum_root = btrfs_csum_root(trans->fs_info, em->block_start); + ret = btrfs_lookup_csums_range(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -4321,13 +4520,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } static int log_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_root *root, + struct btrfs_inode *inode, const struct extent_map *em, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct btrfs_map_token token; @@ -4340,14 +4539,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - drop_args.path = path; - drop_args.start = em->start; - drop_args.end = em->start + em->len; - drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); - ret = btrfs_drop_extents(trans, log, inode, &drop_args); - if (ret) - return ret; + /* + * If this is the first time we are logging the inode in the current + * transaction, we can avoid btrfs_drop_extents(), which is expensive + * because it does a deletion search, which always acquires write locks + * for extent buffers at levels 2, 1 and 0. This not only wastes time + * but also adds significant contention in a log tree, since log trees + * are small, with a root at level 2 or 3 at most, due to their short + * life span. + */ + if (inode_logged(trans, inode)) { + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; + } if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); @@ -4505,13 +4715,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. */ - do { - ret = btrfs_truncate_inode_items(trans, - root->log_root, - inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY, - NULL); - } while (ret == -EAGAIN); + ret = truncate_inode_items(trans, root->log_root, inode, + truncate_offset, + BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; @@ -4538,7 +4744,6 @@ out: } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) @@ -4603,7 +4808,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, ctx); + ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4692,11 +4897,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, * with a journal, ext3/4, xfs, f2fs, etc). */ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { + struct btrfs_root *root = inode->root; int ret; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -4770,10 +4975,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, * truncate operation that changes the inode's size. 
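As background for btrfs_log_holes(): without the NO_HOLES feature, every gap between file extent items, and between the last extent and i_size, must be represented explicitly; a minimal userspace model of that gap detection (extent layout invented):

#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t offset, len; }; /* file offset and length */

int main(void)
{
	/* Invented extent layout of one file, sorted by offset. */
	const struct extent ext[] = { {0, 4096}, {8192, 4096} };
	const uint64_t i_size = 16384;
	uint64_t prev_end = 0;

	for (int i = 0; i < 2; i++) {
		if (ext[i].offset > prev_end)
			printf("hole at [%llu, %llu)\n",
			       (unsigned long long)prev_end,
			       (unsigned long long)ext[i].offset);
		prev_end = ext[i].offset + ext[i].len;
	}
	if (prev_end < i_size) /* trailing hole up to i_size */
		printf("hole at [%llu, %llu)\n",
		       (unsigned long long)prev_end,
		       (unsigned long long)i_size);
	return 0;
}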
*/ static int btrfs_log_holes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -4915,7 +5120,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_path *search_path; char *name = NULL; u32 name_len = 0; - u32 item_size = btrfs_item_size_nr(eb, slot); + u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; unsigned long ptr = btrfs_item_ptr_offset(eb, slot); @@ -5050,7 +5255,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { - ret = btrfs_log_inode(trans, root, + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, ctx); @@ -5110,8 +5315,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, ctx); + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5222,7 +5426,7 @@ again: &other_ino, &other_parent); if (ret < 0) { return ret; - } else if (ret > 0 && ctx && + } else if (ret > 0 && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { if (ins_nr > 0) { ins_nr++; @@ -5322,7 +5526,7 @@ next_key: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx) { @@ -5330,7 +5534,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; int err = 0; int ret = 0; bool fast_search = false; @@ -5372,22 +5576,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * Only run delayed items if we are a directory. We want to make sure * all directory indexes hit the fs/subvolume tree so we can find them * and figure out which index ranges have to be logged. - * - * Otherwise commit the delayed inode only if the full sync flag is set, - * as we want to make sure an up to date version is in the subvolume - * tree so copy_inode_items_to_log() / copy_items() can find it and copy - * it to the log tree. For a non full sync, we always log the inode item - * based on the in-memory struct btrfs_inode which is always up to date. 
*/ - if (S_ISDIR(inode->vfs_inode.i_mode)) - ret = btrfs_commit_inode_delayed_items(trans, inode); - else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - ret = btrfs_commit_inode_delayed_inode(inode); - - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + if (S_ISDIR(inode->vfs_inode.i_mode)) { + err = btrfs_commit_inode_delayed_items(trans, inode); + if (err) + goto out; } if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { @@ -5426,9 +5619,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, max_key_type); + ret = drop_inode_items(trans, log, path, inode, max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5450,19 +5643,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - while(1) { - ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0, NULL); - if (ret != -EAGAIN) - break; - } + if (inode_logged(trans, inode)) + ret = truncate_inode_items(trans, log, + inode, 0, 0); } } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags) || @@ -5470,8 +5660,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5494,14 +5684,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, inode, path); if (err) goto out_unlock; } @@ -5521,16 +5711,14 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. 
*/ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx); + ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); if (ret) { err = ret; goto out_unlock; @@ -5545,59 +5733,52 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path, - ctx); + ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; /* - * If we are logging that an ancestor inode exists as part of logging a - * new name from a link or rename operation, don't mark the inode as - * logged - otherwise if an explicit fsync is made against an ancestor, - * the fsync considers the inode in the log and doesn't sync the log, - * resulting in the ancestor missing after a power failure unless the - * log was synced as part of an fsync against any other unrelated inode. - * So keep it simple for this case and just don't flag the ancestors as - * logged. + * Don't update last_log_commit if we logged that an inode exists. + * We do this for three reasons: + * + * 1) We might have had buffered writes to this inode that were + * flushed and had their ordered extents completed in this + * transaction, but we did not previously log the inode with + * LOG_INODE_ALL. Later the inode was evicted and after that + * it was loaded again and this LOG_INODE_EXISTS log operation + * happened. We must make sure that if an explicit fsync against + * the inode is performed later, it logs the new extents, an + * updated inode item, etc, and syncs the log. The same logic + * applies to direct IO writes instead of buffered writes. + * + * 2) When we log the inode with LOG_INODE_EXISTS, its inode item + * is logged with an i_size of 0 or whatever value was logged + * before. If later the i_size of the inode is increased by a + * truncate operation, the log is synced through an fsync of + * some other inode and then finally an explicit fsync against + * this inode is made, we must make sure this fsync logs the + * inode with the new i_size, the hole between old i_size and + * the new i_size, and syncs the log. + * + * 3) If we are logging that an ancestor inode exists as part of + * logging a new name from a link or rename operation, don't update + * its last_log_commit - otherwise if an explicit fsync is made + * against an ancestor, the fsync considers the inode in the log + * and doesn't sync the log, resulting in the ancestor missing after + * a power failure unless the log was synced as part of an fsync + * against any other unrelated inode. */ - if (!ctx || - !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && - &inode->vfs_inode != ctx->inode)) { - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - /* - * Don't update last_log_commit if we logged that an inode exists. - * We do this for two reasons: - * - * 1) We might have had buffered writes to this inode that were - * flushed and had their ordered extents completed in this - * transaction, but we did not previously log the inode with - * LOG_INODE_ALL. Later the inode was evicted and after that - * it was loaded again and this LOG_INODE_EXISTS log operation - * happened. 
We must make sure that if an explicit fsync against - * the inode is performed later, it logs the new extents, an - * updated inode item, etc, and syncs the log. The same logic - * applies to direct IO writes instead of buffered writes. - * - * 2) When we log the inode with LOG_INODE_EXISTS, its inode item - * is logged with an i_size of 0 or whatever value was logged - * before. If later the i_size of the inode is increased by a - * truncate operation, the log is synced through an fsync of - * some other inode and then finally an explicit fsync against - * this inode is made, we must make sure this fsync logs the - * inode with the new i_size, the hole between old i_size and - * the new i_size, and syncs the log. - */ - if (inode_only != LOG_INODE_EXISTS) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); - } + if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); - +out: btrfs_free_path(path); btrfs_free_path(dst_path); return err; @@ -5672,18 +5853,12 @@ struct btrfs_dir_list { * link_to_fixup_dir()); * * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and - * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item * has a size that doesn't match the sum of the lengths of all the logged - * names. This does not result in a problem because if a dir_item key is - * logged but its matching dir_index key is not logged, at log replay time we - * don't use it to replay the respective name (see replay_one_name()). On the - * other hand if only the dir_index key ends up being logged, the respective - * name is added to the fs/subvol tree with both the dir_item and dir_index - * keys created (see replay_one_name()). - * The directory's inode item with a wrong i_size is not a problem as well, - * since we don't use it at log replay time to set the i_size in the inode - * item of the fs/subvol tree (see overwrite_item()). + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5697,6 +5872,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_dir_list *dir_elem; int ret = 0; + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. 
+ */ + if (ctx->logging_new_name) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5721,7 +5904,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, goto next_dir_inode; min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.type = BTRFS_DIR_INDEX_KEY; min_key.offset = 0; again: btrfs_release_path(path); @@ -5746,7 +5929,7 @@ process_leaf: btrfs_item_key_to_cpu(leaf, &min_key, i); if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_ITEM_KEY) + min_key.type != BTRFS_DIR_INDEX_KEY) goto next_dir_inode; di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); @@ -5773,7 +5956,7 @@ process_leaf: ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); btrfs_add_delayed_iput(di_inode); if (ret) @@ -5858,7 +6041,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) break; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; @@ -5917,11 +6100,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, continue; } - if (ctx) - ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && ctx && ctx->log_new_dentries) + if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); btrfs_add_delayed_iput(dir_inode); @@ -5967,7 +6149,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation >= trans->transid && need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) @@ -6022,7 +6204,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation >= trans->transid && need_log_inode(trans, inode)) { - ret = btrfs_log_inode(trans, root, inode, + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); if (ret) break; @@ -6165,7 +6347,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); + ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6182,7 +6364,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) log_dentries = true; /* @@ -6308,8 +6490,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to pin buffers while recovering log root tree."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6322,8 +6503,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_handle_fs_error(fs_info, ret, - "Couldn't find tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } if (ret > 0) { @@ -6340,8 +6520,7 @@ again: log = 
btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6369,8 +6548,7 @@ again: if (!ret) goto next; - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read target root for tree log recovery."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6378,14 +6556,15 @@ again: ret = btrfs_record_root_in_trans(trans, wc.replay_dest); if (ret) /* The loop needs to continue due to the root refs */ - btrfs_handle_fs_error(fs_info, ret, - "failed to record the log root in transaction"); + btrfs_abort_transaction(trans, ret); else ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) + btrfs_abort_transaction(trans, ret); } if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { @@ -6402,6 +6581,8 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); + if (ret) + btrfs_abort_transaction(trans, ret); } wc.replay_dest->log_root = NULL; @@ -6562,15 +6743,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * was previously logged, make sure the next log attempt on the directory * is not skipped and logs the inode again. This is because the log may * not currently be authoritative for a range including the old - * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make - * sure after a log replay we do not end up with both the new and old - * dentries around (in case the inode is a directory we would have a - * directory with two hard links and 2 inode references for different - * parents). The next log attempt of old_dir will happen at - * btrfs_log_all_parents(), called through btrfs_log_inode_parent() - * below, because we have previously set inode->last_unlink_trans to the - * current transaction ID, either here or at btrfs_record_unlink_dir() in - * case inode is a directory. + * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we + * do not end up with both the new and old dentries around (in case the + * inode is a directory we would have a directory with two hard links and + * 2 inode references for different parents). The next log attempt of + * old_dir will happen at btrfs_log_all_parents(), called through + * btrfs_log_inode_parent() below, because we have previously set + * inode->last_unlink_trans to the current transaction ID, either here or + * at btrfs_record_unlink_dir() in case the inode is a directory. */ if (old_dir) old_dir->logged_trans = 0; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 731bd9c029f5..f6811c3df38a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Tracks the last logged dir item/index key offset. */ + u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. 
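The new last_dir_item_offset field above, mirrored into the inode's last_dir_index_offset, acts as a high-water mark; a runnable model of the decision it enables in process_dir_items_leaf() (all values invented):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Model: offsets at or below the mark may already be in the log and need a
 * search; anything above the mark is known to be new and can be batched. */
static bool needs_log_search(uint64_t key_offset, uint64_t last_logged)
{
	return key_offset <= last_logged;
}

int main(void)
{
	uint64_t last_logged = 100; /* invented high-water mark */
	const uint64_t offsets[] = { 90, 100, 101, 150 };

	for (int i = 0; i < 4; i++)
		printf("offset %llu: %s\n", (unsigned long long)offsets[i],
		       needs_log_search(offsets[i], last_logged) ?
		       "search log" : "batch insert");
	return 0;
}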
*/ @@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index); -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid); +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index); +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 74023c8a783f..b458452a1aaf 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); offset = btrfs_item_ptr_offset(eb, slot); ret = -ENOENT; @@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + offset += btrfs_item_size(eb, slot) - sizeof(subid_le); } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", @@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); @@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, goto out; } - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size == sizeof(subid)) { ret = btrfs_del_item(trans, uuid_root, path); goto out; @@ -331,7 +331,7 @@ again_search_slot: goto skip; offset = btrfs_item_ptr_offset(leaf, slot); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4968535dfff0..90eb5c2830a9 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (key.objectid != btrfs_ino(inode) || key.type != key_type) break; - item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; if (copied > 0) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2ec3b8ac8fa3..b07d382d53a8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include <linux/namei.h> 
#include "misc.h" #include "ctree.h" #include "extent_map.h" @@ -33,6 +34,10 @@ #include "discard.h" #include "zoned.h" +#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10 | \ + BTRFS_BLOCK_GROUP_RAID56_MASK) + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -250,7 +255,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map); /* @@ -508,7 +513,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } if (flush) - filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); @@ -812,9 +817,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, device = NULL; } else { + struct btrfs_dev_lookup_args args = { + .devid = devid, + .uuid = disk_super->dev_item.uuid, + }; + mutex_lock(&fs_devices->device_list_mutex); - device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL); + device = btrfs_find_device(fs_devices, &args); /* * If this disk has been pulled into an fs devices created by @@ -1091,7 +1100,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } @@ -1122,8 +1131,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; + } btrfs_close_bdev(device); if (device->bdev) { @@ -1155,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); ASSERT(list_empty(&device->dev_alloc_list)); ASSERT(list_empty(&device->post_commit_list)); - ASSERT(atomic_read(&device->reada_in_flight) == 0); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -1222,7 +1232,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return -EINVAL; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; @@ -1286,7 +1296,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev pgoff_t index; /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ @@ -1363,8 +1373,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bytenr_orig = btrfs_sb_offset(0); ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); - if (ret) - return ERR_PTR(ret); + if (ret) { + device = ERR_PTR(ret); + goto error_bdev_put; + } disk_super = 
btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) { @@ -1843,8 +1855,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; @@ -1882,18 +1896,22 @@ out: /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probes like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. */ -static void update_dev_time(struct block_device *bdev) +static void update_dev_time(const char *device_path) { - struct inode *inode = bdev->bd_inode; + struct path path; struct timespec64 now; + int ret; - /* Shouldn't happen but just in case. */ - if (!inode) + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) return; - now = current_time(inode); - generic_update_time(inode, &now, S_MTIME | S_CTIME); + now = current_time(d_inode(path.dentry)); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@ -1917,7 +1935,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1986,7 +2006,7 @@ static struct btrfs_device * btrfs_find_next_active_device( } /* - * Helper function to check if the given device is part of s_bdev / latest_bdev + * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function is called, there should always be another device (or * this_dev) which is active. */ @@ -2005,8 +2025,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; - if (fs_info->fs_devices->latest_bdev == device->bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; + if (fs_info->fs_devices->latest_dev->bdev == device->bdev) + fs_info->fs_devices->latest_dev = next_device; } /* @@ -2069,11 +2089,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(bdev); + update_dev_time(device_path); } -int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid, struct block_device **bdev, fmode_t *mode) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2081,22 +2102,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. 
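A sketch of how a caller is expected to combine the lookup-args helpers introduced later in this patch, resolving a device from a user-supplied path before any filesystem state locks are taken (the wrapper function itself is hypothetical; error handling trimmed):

/* Sketch only: resolve a device before taking state locks. */
static int resolve_device(struct btrfs_fs_info *fs_info, const char *path,
			  struct btrfs_device **device)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	int ret;

	/* Reads the super block at @path; no fs locks held yet. */
	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
	if (ret)
		return ret;
	*device = btrfs_find_device(fs_info->fs_devices, &args);
	btrfs_put_dev_args_from_path(&args); /* frees the uuid/fsid copies */
	return *device ? 0 : -ENOENT;
}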
+ */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - device = btrfs_find_device_by_devspec(fs_info, devid, device_path); - - if (IS_ERR(device)) { - if (PTR_ERR(device) == -ENOENT && - device_path && strcmp(device_path, "missing") == 0) + device = btrfs_find_device(fs_info->fs_devices, args); + if (!device) { + if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else - ret = PTR_ERR(device); + ret = -ENOENT; goto out; } @@ -2126,11 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); - if (!ret) - btrfs_reada_remove_dev(device); - mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2159,7 +2177,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to - * its own fs_devices listed under the fs_devices->seed. + * its own fs_devices listed under the fs_devices->seed_list. */ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); @@ -2210,18 +2228,24 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, synchronize_rcu(); btrfs_free_device(device); - if (cur_devices->open_devices == 0) { + /* + * This can happen if cur_devices is the private seed devices list. We + * cannot call close_fs_devices() here because it expects the uuid_mutex + * to be held, but in fact we don't need that for the private + * seed_devices, we can simply decrement cur_devices->opened and then + * remove it from our list and free the fs_devices. + */ + if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - close_fs_devices(cur_devices); + ASSERT(cur_devices->opened == 1); + cur_devices->opened--; free_fs_devices(cur_devices); } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: - btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2305,13 +2329,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - /* - * The update_dev_time() with in btrfs_scratch_superblocks() - * may lead to a call to btrfs_show_devname() which will try - * to hold device_list_mutex. And here this device - * is already out of device list, so we don't have to hold - * the device_list_mutex lock. - */ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, tgtdev->name->str); @@ -2320,86 +2337,109 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -static struct btrfs_device *btrfs_find_device_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) +/** + * Populate args from device at path + * + * @fs_info: the filesystem + * @args: the args to populate + * @path: the path to the device + * + * This will read the super block of the device at @path and populate @args with + * the devid, fsid, and uuid. This is meant to be used for ioctls that need to + * lookup a device to operate on, but need to do it before we take any locks. + * This properly handles the special case of "missing" that a user may pass in, + * and does some basic sanity checks. 
The caller must make sure that @path is + * properly NUL terminated before calling in, and must call + * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and + * uuid buffers. + * + * Return: 0 for success, -errno for failure + */ +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path) { - int ret = 0; struct btrfs_super_block *disk_super; - u64 devid; - u8 *dev_uuid; struct block_device *bdev; - struct btrfs_device *device; + int ret; - ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) - return ERR_PTR(ret); + if (!path || !path[0]) + return -EINVAL; + if (!strcmp(path, "missing")) { + args->missing = true; + return 0; + } - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; + args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); + args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); + if (!args->uuid || !args->fsid) { + btrfs_put_dev_args_from_path(args); + return -ENOMEM; + } + + ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + &bdev, &disk_super); + if (ret) { + btrfs_put_dev_args_from_path(args); + return ret; + } + args->devid = btrfs_stack_device_id(&disk_super->dev_item); + memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid); + memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid); - + memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - if (!device) - device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return device; + return 0; } /* - * Lookup a device given by device id, or the path if the id is 0. + * Only use this jointly with btrfs_get_dev_args_from_path() because we will + * allocate our ->uuid and ->fsid pointers, everybody else uses local variables + * that don't need to be freed. */ +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) +{ + kfree(args->uuid); + kfree(args->fsid); + args->uuid = NULL; + args->fsid = NULL; +} + struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; + int ret; if (devid) { - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } - if (!device_path || !device_path[0]) - return ERR_PTR(-EINVAL); - - if (strcmp(device_path, "missing") == 0) { - /* Find first missing device */ - list_for_each_entry(device, &fs_info->fs_devices->devices, - dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) && !device->bdev) - return device; - } + ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); + if (ret) + return ERR_PTR(ret); + device = btrfs_find_device(fs_info->fs_devices, &args); + btrfs_put_dev_args_from_path(&args); + if (!device) return ERR_PTR(-ENOENT); - } - - return btrfs_find_device_by_path(fs_info, device_path); + return device; } -/* - * does all the dirty work required for changing file system's UUID. 
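The sprout preparation below is split so that allocations happen before device_list_mutex is held and only the list splicing runs under the lock; a sketch of the resulting call order in the add-device path (caller shape assumed from the two new helpers, not shown in this hunk):

/* Sketch: btrfs_init_sprout() allocates and may sleep; btrfs_setup_sprout()
 * only splices lists and stamps the new fsid, so it runs under the lock. */
seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices))
	return PTR_ERR(seed_devices);

mutex_lock(&fs_devices->device_list_mutex);
btrfs_setup_sprout(fs_info, seed_devices);
mutex_unlock(&fs_devices->device_list_mutex);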
- */ -static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) +static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct btrfs_device *device; - u64 super_flags; lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * Private copy of the seed devices, anchored at @@ -2407,7 +2447,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) - return PTR_ERR(seed_devices); + return seed_devices; /* * It's necessary to retain a copy of the original seed fs_devices in @@ -2418,7 +2458,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); - return PTR_ERR(old_devices); + return old_devices; } list_add(&old_devices->fs_list, &fs_uuids); @@ -2429,7 +2469,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - mutex_lock(&fs_devices->device_list_mutex); + return seed_devices; +} + +/* + * Splice seed devices into the sprout fs_devices. + * Generate a new fsid for the sprouted read-write filesystem. + */ +static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *seed_devices) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + /* + * We are updating the fsid, the thread leading to device_list_add() + * could race, so uuid_mutex is needed. + */ + lockdep_assert_held(&uuid_mutex); + + /* + * The threads listed below may traverse dev_list but can do that without + * device_list_mutex: + * - All device ops and balance - as we are in btrfs_exclop_start. + * - Various dev_list readers - are using RCU. + * - btrfs_ioctl_fitrim() - is using RCU. 
+ * + * For-read threads as below are using device_list_mutex: + * - Readonly scrub btrfs_scrub_dev() + * - Readonly scrub btrfs_scrub_progress() + * - btrfs_get_dev_stats() + */ + lockdep_assert_held(&fs_devices->device_list_mutex); + list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) @@ -2445,13 +2519,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - mutex_unlock(&fs_devices->device_list_mutex); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); - - return 0; } /* @@ -2459,6 +2530,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; @@ -2468,7 +2540,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - u64 devid; int ret; path = btrfs_alloc_path(); @@ -2480,7 +2551,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) key.type = BTRFS_DEV_ITEM_KEY; while (1) { + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; @@ -2505,13 +2578,14 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - devid = btrfs_device_id(leaf, dev_item); + args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2539,10 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct super_block *sb = fs_info->sb; struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; u64 orig_super_num_devices; - int seeding_dev = 0; int ret = 0; + bool seeding_dev = false; bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) @@ -2559,7 +2634,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (fs_devices->seeding) { - seeding_dev = 1; + seeding_dev = true; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); locked = true; @@ -2594,7 +2669,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error_free_device; @@ -2610,8 +2685,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = round_down(i_size_read(bdev->bd_inode), - fs_info->sectorsize); + 
device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -2622,16 +2697,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { btrfs_clear_sb_rdonly(sb); - ret = btrfs_prepare_sprout(fs_info); - if (ret) { + + /* GFP_KERNEL allocation must not be under device_list_mutex */ + seed_devices = btrfs_init_sprout(fs_info); + if (IS_ERR(seed_devices)) { + ret = PTR_ERR(seed_devices); btrfs_abort_transaction(trans, ret); goto error_trans; } } + mutex_lock(&fs_devices->device_list_mutex); + if (seeding_dev) { + btrfs_setup_sprout(fs_info, seed_devices); + btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, + device); + } + device->fs_devices = fs_devices; - mutex_lock(&fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -2693,7 +2777,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* * fs_devices now represents the newly sprouted filesystem and - * its fsid has been changed by btrfs_prepare_sprout + * its fsid has been changed by btrfs_setup_sprout(). */ btrfs_sysfs_update_sprout_fsid(fs_devices); } @@ -2733,7 +2817,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(bdev); + update_dev_time(device_path); return ret; @@ -2826,6 +2910,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; + int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; @@ -2854,7 +2939,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); - return btrfs_update_device(trans, device); + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) @@ -3096,7 +3185,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4301,8 +4390,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + } /* * Balance can be canceled by: * @@ -4378,6 +4469,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); /* * A ro->rw remount sequence should continue with the paused balance * regardless of who pauses it, system
or the user as of now, so set @@ -4446,7 +4541,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4587,7 +4682,7 @@ int btrfs_uuid_scan_kthread(void *data) eb = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); if (item_size < sizeof(root_item)) goto skip; @@ -4889,8 +4984,10 @@ again: round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); + btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4973,7 +5070,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) } /* - * Structure used internally for __btrfs_alloc_chunk() function. + * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. */ struct alloc_chunk_ctl { @@ -5377,7 +5474,7 @@ error_del_extent: return block_group; } -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; @@ -5446,7 +5543,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_key key; struct btrfs_chunk *chunk; @@ -5518,7 +5614,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, } btrfs_set_stack_chunk_length(chunk, bg->length); - btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); @@ -5578,12 +5674,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_alloc_chunk(trans, alloc_profile); + meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_alloc_chunk(trans, alloc_profile); + sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -5597,17 +5693,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) return btrfs_raid_array[index].tolerated_failures; } -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - int readonly = 0; int miss_ndevs = 0; int i; + bool ret = true; em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) - return 1; + return false; map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { @@ -5618,21 +5714,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) } if 
(!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { - readonly = 1; + ret = false; goto end; } } /* - * If the number of missing devices is larger than max errors, - * we can not write the data into that chunk successfully, so - * set it readonly. + * If the number of missing devices is larger than max errors, we can + * not write the data into that chunk successfully. */ if (miss_ndevs > btrfs_chunk_max_errors(map)) - readonly = 1; + ret = false; end: free_extent_map(em); - return readonly; + return ret; } void btrfs_mapping_tree_free(struct extent_map_tree *tree) @@ -5795,7 +5890,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) +static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) { int i; int again = 1; @@ -5804,52 +5899,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) again = 0; for (i = 0; i < num_stripes - 1; i++) { /* Swap if parity is on a smaller index */ - if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { - swap(bbio->stripes[i], bbio->stripes[i + 1]); - swap(bbio->raid_map[i], bbio->raid_map[i + 1]); + if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { + swap(bioc->stripes[i], bioc->stripes[i + 1]); + swap(bioc->raid_map[i], bioc->raid_map[i + 1]); again = 1; } } } } -static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + int total_stripes, + int real_stripes) { - struct btrfs_bio *bbio = kzalloc( - /* the size of the btrfs_bio */ - sizeof(struct btrfs_bio) + - /* plus the variable array for the stripes */ - sizeof(struct btrfs_bio_stripe) * (total_stripes) + - /* plus the variable array for the tgt dev */ + struct btrfs_io_context *bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ + sizeof(struct btrfs_io_stripe) * (total_stripes) + + /* Plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + /* - * plus the raid_map, which includes both the tgt dev - * and the stripes + * Plus the raid_map, which includes both the tgt dev + * and the stripes. */ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bbio->error, 0); - refcount_set(&bbio->refs, 1); + atomic_set(&bioc->error, 0); + refcount_set(&bioc->refs, 1); - bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); - bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + bioc->fs_info = fs_info; + bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); + bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); - return bbio; + return bioc; } -void btrfs_get_bbio(struct btrfs_bio *bbio) +void btrfs_get_bioc(struct btrfs_io_context *bioc) { - WARN_ON(!refcount_read(&bbio->refs)); - refcount_inc(&bbio->refs); + WARN_ON(!refcount_read(&bioc->refs)); + refcount_inc(&bioc->refs); } -void btrfs_put_bbio(struct btrfs_bio *bbio) +void btrfs_put_bioc(struct btrfs_io_context *bioc) { - if (!bbio) + if (!bioc) return; - if (refcount_dec_and_test(&bbio->refs)) - kfree(bbio); + if (refcount_dec_and_test(&bioc->refs)) + kfree(bioc); } /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? 
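The btrfs_chunk_writeable() logic above boils down to two rules: any present-but-readonly stripe device makes the chunk unwriteable immediately, while missing devices are tolerated up to the profile's tolerated failures. A compilable userspace sketch of that decision follows; the profile numbers and tolerance values are illustrative, not the btrfs_raid_array contents.

#include <stdbool.h>
#include <stdio.h>

struct stripe { bool dev_present; bool dev_writeable; };

/* Per-profile tolerated failures, in the spirit of btrfs_raid_array:
 * e.g. a RAID1-like profile tolerates 1 missing device, RAID6-like 2. */
static int tolerated_failures(int profile)
{
	switch (profile) {
	case 1: return 1;	/* RAID1-like */
	case 6: return 2;	/* RAID6-like */
	default: return 0;	/* SINGLE/RAID0-like */
	}
}

static bool chunk_writeable(const struct stripe *s, int n, int profile)
{
	int missing = 0;

	for (int i = 0; i < n; i++) {
		if (!s[i].dev_present) {
			missing++;
			continue;
		}
		/* A present but read-only device is an immediate failure. */
		if (!s[i].dev_writeable)
			return false;
	}
	/* Too many missing devices and the write could not complete. */
	return missing <= tolerated_failures(profile);
}

int main(void)
{
	struct stripe s[2] = { { true, true }, { false, false } };

	printf("RAID1-like writeable: %d\n", chunk_writeable(s, 2, 1)); /* 1 */
	printf("SINGLE-like writeable: %d\n", chunk_writeable(s, 2, 0)); /* 0 */
	return 0;
}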
*/ @@ -5859,11 +5957,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { struct extent_map *em; struct map_lookup *map; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5882,8 +5980,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, int ret = 0; int i; - /* discard always return a bbio */ - ASSERT(bbio_ret); + /* Discard always returns a bioc. */ + ASSERT(bioc_ret); em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) @@ -5946,26 +6044,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &stripe_index); } - bbio = alloc_btrfs_bio(num_stripes, 0); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * + bioc->stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; + bioc->stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -5976,19 +6073,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; + bioc->stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; + bioc->stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bbio->stripes[i].length = length; + bioc->stripes[i].length = length; } stripe_index++; @@ -5998,9 +6093,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; out: free_extent_map(em); return ret; @@ -6024,7 +6119,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, u64 srcdev_devid, int *mirror_num, u64 *physical) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int num_stripes; int index_srcdev = 0; int found = 0; @@ -6033,20 +6128,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bbio, 0, 0); + logical, &length, &bioc, 0, 0); if (ret) { - ASSERT(bbio == NULL); + ASSERT(bioc == NULL); return ret; } - num_stripes = bbio->num_stripes; + num_stripes = bioc->num_stripes; if (*mirror_num > num_stripes) { /* * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, * that means that the requested area is not left of the left * cursor */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return -EIO; } @@ -6056,7 +6151,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * pointer to the one of the target drive. 
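alloc_btrfs_io_context() above packs the context and its three variable-length arrays into one allocation and then carves tgtdev_map and raid_map out of the tail; the payoff is that the single kfree() in the final btrfs_put_bioc() releases everything. The following standalone sketch, with simplified types and an invented field set, shows the same layout trick. Note that placing the int array between two u64-based arrays means raid_map's natural alignment depends on the int count keeping the offset 8-byte aligned on strict-alignment targets.

#include <stdint.h>
#include <stdlib.h>

struct io_stripe { void *dev; uint64_t physical; uint64_t length; };

struct io_context {
	int refs;		/* the kernel uses refcount_t; last put frees */
	int num_stripes;
	int *tgtdev_map;	/* points into the same allocation */
	uint64_t *raid_map;	/* ditto */
	struct io_stripe stripes[];	/* flexible array, must stay last */
};

static struct io_context *alloc_io_context(int total_stripes, int real_stripes)
{
	struct io_context *ioc;

	/* One allocation sized for the struct plus all three arrays. */
	ioc = calloc(1, sizeof(*ioc) +
			sizeof(struct io_stripe) * total_stripes +
			sizeof(int) * real_stripes +
			sizeof(uint64_t) * total_stripes);
	if (!ioc)
		return NULL;

	ioc->refs = 1;
	ioc->num_stripes = total_stripes;
	/* tgtdev_map starts right after the stripes array ... */
	ioc->tgtdev_map = (int *)(ioc->stripes + total_stripes);
	/* ... and raid_map right after tgtdev_map. */
	ioc->raid_map = (uint64_t *)(ioc->tgtdev_map + real_stripes);
	return ioc;
}

int main(void)
{
	struct io_context *ioc = alloc_io_context(4, 2);

	if (!ioc)
		return 1;
	ioc->raid_map[3] = (uint64_t)-2;	/* parity marker, in the spirit of RAID5_P_STRIPE */
	free(ioc);	/* one free() releases the struct and all arrays */
	return 0;
}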
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid != srcdev_devid) + if (bioc->stripes[i].dev->devid != srcdev_devid) continue; /* @@ -6064,15 +6159,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * mirror with the lowest physical address */ if (found && - physical_of_found <= bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); ASSERT(found); if (!found) @@ -6103,12 +6198,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_bio *bbio = *bbio_ret; + struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; int tgtdev_indexes = 0; int num_stripes = *num_stripes_ret; @@ -6138,17 +6233,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, */ index_where_to_add = num_stripes; for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; + struct btrfs_io_stripe *new = + bioc->stripes + index_where_to_add; + struct btrfs_io_stripe *old = + bioc->stripes + i; new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; + bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; tgtdev_indexes++; @@ -6168,30 +6263,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, * full copy of the source drive. 
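The lookup loop above, and the DUP handling that follows, select the same thing: among the stripes that live on the source device, the one with the lowest physical address, which keeps DUP profiles deterministic. Reduced to its essentials, with invented sample data:

#include <stdint.h>
#include <stdio.h>

struct stripe { uint64_t devid; uint64_t physical; };

/* Returns the index of the source-device stripe with the lowest
 * physical address, or -1 if the source device has no stripe. */
static int pick_srcdev_stripe(const struct stripe *s, int n, uint64_t srcdev)
{
	int found = -1;

	for (int i = 0; i < n; i++) {
		if (s[i].devid != srcdev)
			continue;
		if (found >= 0 && s[found].physical <= s[i].physical)
			continue;	/* keep the lower physical address */
		found = i;
	}
	return found;
}

int main(void)
{
	struct stripe s[] = {
		{ .devid = 1, .physical = 4096 },
		{ .devid = 2, .physical = 0 },
		{ .devid = 1, .physical = 1024 },	/* DUP copy, lower */
	};

	printf("index: %d\n", pick_srcdev_stripe(s, 3, 1));	/* prints 2 */
	return 0;
}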
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* * In case of DUP, in order to keep it simple, * only add the mirror with the lowest physical * address */ if (found && - physical_of_found <= - bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } } if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_io_stripe *tgtdev_stripe = + bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; + bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + bioc->tgtdev_map[index_srcdev] = num_stripes; tgtdev_indexes++; num_stripes++; @@ -6200,8 +6294,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, *num_stripes_ret = num_stripes; *max_errors_ret = max_errors; - bbio->num_tgtdevs = tgtdev_indexes; - *bbio_ret = bbio; + bioc->num_tgtdevs = tgtdev_indexes; + *bioc_ret = bioc; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6258,7 +6352,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + /* Only stripe based profiles needs to check against stripe length. */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { u64 max_len = stripe_len - stripe_offset; /* @@ -6304,7 +6399,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map) { struct extent_map *em; @@ -6319,7 +6414,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; @@ -6328,7 +6423,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 raid56_full_stripe_start = (u64)-1; struct btrfs_io_geometry geom; - ASSERT(bbio_ret); + ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); em = btrfs_get_chunk_map(fs_info, logical, *length); @@ -6472,20 +6567,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } - bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = map->stripes[stripe_index].physical + + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } - /* build raid_map */ + /* Build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; @@ -6497,15 +6592,15 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info, /* Fill in the logical address of each stripe */ tmp = stripe_nr * data_stripes; for (i = 0; i < data_stripes; i++) - bbio->raid_map[(i+rot) % num_stripes] = + bioc->raid_map[(i + rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; - bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bbio->raid_map[(i+rot+1) % num_stripes] = + bioc->raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE; - sort_parity_stripes(bbio, num_stripes); + sort_parity_stripes(bioc, num_stripes); } if (need_full_stripe(op)) @@ -6513,15 +6608,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, &num_stripes, &max_errors); } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -6530,9 +6625,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, */ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { WARN_ON(num_stripes > 1); - bbio->stripes[0].dev = dev_replace->tgtdev; - bbio->stripes[0].physical = physical_to_patch_in_first_stripe; - bbio->mirror_num = map->num_stripes + 1; + bioc->stripes[0].dev = dev_replace->tgtdev; + bioc->stripes[0].physical = physical_to_patch_in_first_stripe; + bioc->mirror_num = map->num_stripes + 1; } out: if (dev_replace_is_ongoing) { @@ -6546,43 +6641,43 @@ out: int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num) + struct btrfs_io_context **bioc_ret, int mirror_num) { if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) +static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) { - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; + bio->bi_private = bioc->private; + bio->bi_end_io = bioc->end_io; bio_endio(bio); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_io_context *bioc = bio->bi_private; int is_orig_bio = 0; if (bio->bi_status) { - atomic_inc(&bbio->error); + atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_io_bio(bio)->device; + struct btrfs_device *dev = btrfs_bio(bio)->device; ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) @@ 
-6597,22 +6692,22 @@ static void btrfs_end_bio(struct bio *bio) } } - if (bio == bbio->orig_bio) + if (bio == bioc->orig_bio) is_orig_bio = 1; - btrfs_bio_counter_dec(bbio->fs_info); + btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + if (atomic_dec_and_test(&bioc->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = bbio->orig_bio; + bio = bioc->orig_bio; } - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ - if (atomic_read(&bbio->error) > bbio->max_errors) { + if (atomic_read(&bioc->error) > bioc->max_errors) { bio->bi_status = BLK_STS_IOERR; } else { /* @@ -6622,19 +6717,19 @@ static void btrfs_end_bio(struct bio *bio) bio->bi_status = BLK_STS_OK; } - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } else if (!is_orig_bio) { bio_put(bio); } } -static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, +static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, u64 physical, struct btrfs_device *dev) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bioc->fs_info; - bio->bi_private = bbio; - btrfs_io_bio(bio)->device = dev; + bio->bi_private = bioc; + btrfs_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6663,20 +6758,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfsic_submit_bio(bio); } -static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) { - atomic_inc(&bbio->error); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) { /* Should be the original bio. 
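btrfs_end_bio() combines two idioms worth spelling out: recovering the per-bio wrapper from a plain bio pointer via container_of() (the btrfs_bio() helper declared in volumes.h further below), and an atomic decrement-and-test on stripes_pending so that only the final stripe completion finalizes the original bio, comparing accumulated errors against max_errors. A self-contained userspace sketch of both, using simplified types and C11 atomics rather than the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bio { int status; void *bi_private; };

/* Per-bio wrapper; like struct btrfs_bio, the bio member comes last. */
struct wbio {
	unsigned int mirror_num;
	struct bio bio;
};

struct io_context {
	atomic_int stripes_pending;
	atomic_int error;
	int max_errors;
};

static void end_bio(struct bio *bio, bool failed)
{
	struct io_context *ioc = bio->bi_private;
	struct wbio *w = container_of(bio, struct wbio, bio);

	if (failed)
		atomic_fetch_add(&ioc->error, 1);
	/* fetch_sub returns the old value: 1 means we were the last. */
	if (atomic_fetch_sub(&ioc->stripes_pending, 1) != 1)
		return;
	/* Only the last completion decides the overall status. */
	bio->status = atomic_load(&ioc->error) > ioc->max_errors ? -5 /* -EIO */ : 0;
	printf("mirror %u done, final status %d\n", w->mirror_num, bio->status);
}

int main(void)
{
	struct io_context ioc = { .max_errors = 1 };
	struct wbio w[3];

	atomic_store(&ioc.stripes_pending, 3);
	for (int i = 0; i < 3; i++) {
		w[i].mirror_num = i + 1;
		w[i].bio.bi_private = &ioc;
	}
	end_bio(&w[0].bio, true);	/* one mirror fails ... */
	end_bio(&w[1].bio, false);
	end_bio(&w[2].bio, false);	/* ... still within tolerance: status 0 */
	return 0;
}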
*/ - WARN_ON(bio != bbio->orig_bio); + WARN_ON(bio != bioc->orig_bio); - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bbio->error) > bbio->max_errors) + if (atomic_read(&bioc->error) > bioc->max_errors) bio->bi_status = BLK_STS_IOERR; else bio->bi_status = BLK_STS_OK; - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } } @@ -6691,36 +6786,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int ret; int dev_nr; int total_devs; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; length = bio->bi_iter.bi_size; map_length = length; btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bbio, mirror_num, 1); + &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); return errno_to_blk_status(ret); } - total_devs = bbio->num_stripes; - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - bbio->fs_info = fs_info; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); + total_devs = bioc->num_stripes; + bioc->orig_bio = first_bio; + bioc->private = first_bio->bi_private; + bioc->end_io = first_bio->bi_end_io; + atomic_set(&bioc->stripes_pending, bioc->num_stripes); - if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(fs_info, bio, bbio, - map_length); + ret = raid56_parity_write(bio, bioc, map_length); } else { - ret = raid56_parity_recover(fs_info, bio, bbio, - map_length, mirror_num, 1); + ret = raid56_parity_recover(bio, bioc, map_length, + mirror_num, 1); } btrfs_bio_counter_dec(fs_info); @@ -6735,12 +6828,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bbio->stripes[dev_nr].dev; + dev = bioc->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bbio_error(bbio, first_bio, logical); + bioc_error(bioc, first_bio, logical); continue; } @@ -6749,12 +6842,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; } +static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, + const struct btrfs_fs_devices *fs_devices) +{ + if (args->fsid == NULL) + return true; + if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) + return true; + return false; +} + +static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, + const struct btrfs_device *device) +{ + ASSERT((args->devid != (u64)-1) || args->missing); + + if ((args->devid != (u64)-1) && device->devid != args->devid) + return false; + if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) + return false; + if (!args->missing) + return true; + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + 
!device->bdev) + return true; + return false; +} + /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. @@ -6762,31 +6882,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. */ -struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid) +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; - if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) + if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - if (!fsid || - !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &seed_devs->devices, - dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) - return device; - } + if (!dev_args_match_fs_devices(args, seed_devs)) + continue; + list_for_each_entry(device, &seed_devs->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; } } @@ -6862,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); - atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE, NULL); @@ -6952,6 +7063,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; @@ -7029,11 +7141,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); + args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL); + args.uuid = uuid; + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -7151,6 +7264,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -7159,11 +7273,13 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = btrfs_device_id(leaf, dev_item); + devid = args.devid = 
btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); @@ -7171,8 +7287,7 @@ static int read_one_dev(struct extent_buffer *leaf, return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7236,7 +7351,7 @@ static int read_one_dev(struct extent_buffer *leaf, fill_device_from_item(leaf, dev_item, device); if (device->bdev) { - u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, @@ -7482,6 +7597,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) fs_info->fs_devices->total_rw_bytes = 0; /* + * Lockdep complains about possible circular locking dependency between + * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores + * used for freeze protection of a fs (struct super_block.s_writers), + * which we take when starting a transaction, and extent buffers of the + * chunk tree if we call read_one_dev() while holding a lock on an + * extent buffer of the chunk tree. Since we are mounting the filesystem + * and at this point there can't be any concurrent task modifying the + * chunk tree, to keep it simple, just skip locking on the chunk tree. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + path->skip_locking = 1; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk @@ -7506,10 +7634,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } - /* - * The nodes on level 1 are not locked but we don't need to do - * that during mount time as nothing else can access the tree - */ node = path->nodes[1]; if (node) { if (last_ra_node != node->start) { @@ -7537,7 +7661,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details.
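The matching rules split across dev_args_match_fs_devices() and dev_args_match_device() earlier in this patch are worth restating: devid == (u64)-1 acts as a wildcard (hence the -1 initializer, since the replace target legitimately uses devid 0), uuid and fsid only constrain the match when non-NULL, and "missing" selects a device that is present in the metadata but has no opened bdev. A compact userspace rendering of the per-device check, with simplified types and the fsid test left to the enclosing fs_devices walk as in the kernel:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define UUID_SIZE 16

struct device {
	uint64_t devid;
	uint8_t uuid[UUID_SIZE];
	bool in_metadata;
	bool has_bdev;
};

struct lookup_args { uint64_t devid; const uint8_t *uuid; bool missing; };

static bool args_match_device(const struct lookup_args *args,
			      const struct device *dev)
{
	/* Caller must constrain the search by devid or by "missing". */
	assert(args->devid != (uint64_t)-1 || args->missing);

	if (args->devid != (uint64_t)-1 && dev->devid != args->devid)
		return false;
	if (args->uuid && memcmp(dev->uuid, args->uuid, UUID_SIZE) != 0)
		return false;
	if (!args->missing)
		return true;
	/* "missing": known to the metadata but without an opened bdev. */
	return dev->in_metadata && !dev->has_bdev;
}

int main(void)
{
	struct device dev = { .devid = 3, .in_metadata = true, .has_bdev = false };
	struct lookup_args by_id = { .devid = 3 };
	struct lookup_args missing = { .devid = (uint64_t)-1, .missing = true };

	assert(args_match_device(&by_id, &dev));
	assert(args_match_device(&missing, &dev));
	return 0;
}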
*/ - ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) @@ -7643,7 +7766,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, } slot = path->slots[0]; eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); @@ -7721,7 +7844,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, } if (ret == 0 && - btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { @@ -7841,12 +7964,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); + args.devid = stats->devid; + dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7922,6 +8047,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -7977,7 +8103,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device boundary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; @@ -8208,23 +8334,26 @@ out: return ret; } -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; + if (!btrfs_is_zoned(fs_info)) + return false; + /* Do not attempt to repair in degraded state */ if (btrfs_test_opt(fs_info, DEGRADED)) - return 0; + return true; cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) - return 0; + return true; spin_lock(&cache->lock); if (cache->relocating_repair) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); - return 0; + return true; } cache->relocating_repair = 1; spin_unlock(&cache->lock); @@ -8232,5 +8361,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); - return 0; + return true; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2183361db614..005c9e2a491a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -123,13 +123,6 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_ctx; - /* readahead state */ - atomic_t reada_in_flight; - u64 reada_next; - struct reada_zone *reada_curr_zone; - struct radix_tree_root reada_zones; - struct radix_tree_root reada_extents; - /* disk I/O failure stats. 
For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ int dev_stats_valid; @@ -236,17 +229,40 @@ struct btrfs_fs_devices { bool fsid_change; struct list_head fs_list; + /* + * Number of devices under this fsid including missing and + * replace-target device and excludes seed devices. + */ u64 num_devices; + + /* + * The number of devices that successfully opened, including + * replace-target, excludes seed devices. + */ u64 open_devices; + + /* The number of devices that are under the chunk allocation list. */ u64 rw_devices; + + /* Count of missing devices under this fsid excluding seed device. */ u64 missing_devices; u64 total_rw_bytes; + + /* + * Count of devices from btrfs_super_block::num_devices for this fsid, + * which includes the seed device, excludes the transient replace-target + * device. + */ u64 total_devices; /* Highest generation number of seen devices */ u64 latest_generation; - struct block_device *latest_bdev; + /* + * The mount device or a device with highest generation after removal + * or replace. + */ + struct btrfs_device *latest_dev; /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without @@ -300,48 +316,62 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* - * we need the mirror number and stripe index to be passed around - * the call chain while we are processing end_io (especially errors). - * Really, what we need is a btrfs_bio structure that has this info - * and is properly sized with its stripe array, but we're not there - * quite yet. We have our own btrfs bioset, and all of the bios - * we allocate are actually btrfs_io_bios. We'll cram as much of - * struct btrfs_bio as we can into this over time. + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. */ -struct btrfs_io_bio { +struct btrfs_bio { unsigned int mirror_num; + + /* @device is for stripe IO submission. */ struct btrfs_device *device; - u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_io_bio but relies on bio being last. + * bytes for entire btrfs_bio but relies on bio being last. */ struct bio bio; }; -static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) { - return container_of(bio, struct btrfs_io_bio, bio); + return container_of(bio, struct btrfs_bio, bio); } -static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { - if (io_bio->csum != io_bio->csum_inline) { - kfree(io_bio->csum); - io_bio->csum = NULL; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; } } -struct btrfs_bio_stripe { +struct btrfs_io_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio { +/* + * Context for IO submission for device stripe. + * + * - Track the unfinished mirrors for mirror based profiles + * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. + * + * - Contain the logical -> physical mapping info + * Used by submit_stripe_bio() for mapping logical bio + * into physical device address. + * + * - Contain device replace info + * Used by handle_ops_on_dev_replace() to copy logical bios + * into the new device.
+ * + * - Contain RAID56 full stripe logical bytenrs + */ +struct btrfs_io_context { refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; @@ -361,7 +391,7 @@ struct btrfs_bio { * so raid_map[0] is the start of our full stripe */ u64 *raid_map; - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; struct btrfs_device_info { @@ -396,11 +426,11 @@ struct map_lookup { int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; #define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) + (sizeof(struct btrfs_io_stripe) * (n))) struct btrfs_balance_args; struct btrfs_balance_progress; @@ -414,6 +444,22 @@ struct btrfs_balance_control { struct btrfs_balance_progress stat; }; +/* + * Search for a given device by the set parameters + */ +struct btrfs_dev_lookup_args { + u64 devid; + u8 *uuid; + u8 *fsid; + bool missing; +}; + +/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */ +#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 } + +#define BTRFS_DEV_LOOKUP_ARGS(name) \ + struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT + enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, @@ -437,20 +483,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } -void btrfs_get_bbio(struct btrfs_bio *bbio); -void btrfs_put_bbio(struct btrfs_bio *bbio); +void btrfs_get_bioc(struct btrfs_io_context *bioc); +void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num); + struct btrfs_io_context **bioc_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret); + struct btrfs_io_context **bioc_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, @@ -467,19 +513,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, const char *devpath); +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - const char *device_path, u64 devid, + struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct 
btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid); +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -493,7 +543,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -580,6 +630,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); -int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 8a4514283a4b..99abf41b89b9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * matches our target xattr, so lets check. */ ret = 0; - btrfs_assert_tree_locked(path->nodes[0]); + btrfs_assert_tree_write_locked(path->nodes[0]); di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; @@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const int slot = path->slots[0]; struct extent_buffer *leaf = path->nodes[0]; const u16 old_data_len = btrfs_dir_data_len(leaf, di); - const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 item_size = btrfs_item_size(leaf, slot); const u32 data_size = sizeof(*di) + name_len + size; - struct btrfs_item *item; unsigned long data_ptr; char *ptr; @@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_extend_item(path, data_size); } - item = btrfs_item_nr(slot); ptr = btrfs_item_ptr(leaf, slot, char); - ptr += btrfs_item_size(leaf, item) - data_size; + ptr += btrfs_item_size(leaf, slot) - data_size; di = (struct btrfs_dir_item *)ptr; btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; @@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next_item; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); cur = 0; while (cur < item_size) { u16 name_len = btrfs_dir_name_len(leaf, di); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8afa90074891..767a0c6c9694 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < 
in_buf_pages; i++) { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (in_page) + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); put_page(in_page); + } return ret; } @@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - + kunmap(pages_in[page_in_index]); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap(pages_in[page_in_index]); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 47af1ab3bf12..f559d517c7c4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -4,6 +4,8 @@ #include <linux/slab.h> #include 
<linux/blkdev.h> #include <linux/sched/mm.h> +#include <linux/atomic.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -39,12 +41,30 @@ #define BTRFS_NR_SB_LOG_ZONES 2 /* + * Minimum number of active zones we need: + * + * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors + * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group + * - 1 zone for tree-log dedicated block group + * - 1 zone for relocation + */ +#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) + +/* * Maximum supported zone size. Currently, SMR disks have a zone size of * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not * expect the zone size to become larger than 8GiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) + +static inline bool sb_zone_is_full(const struct blk_zone *zone) +{ + return (zone->cond == BLK_ZONE_COND_FULL) || + (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); +} + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -60,14 +80,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; + int i; - ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL && - zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL); - - empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY); - empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY); - full[0] = (zones[0].cond == BLK_ZONE_COND_FULL); - full[1] = (zones[1].cond == BLK_ZONE_COND_FULL); + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); + full[i] = sb_zone_is_full(&zones[i]); + } /* * Possible states of log buffer zones @@ -195,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos, static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zno; int ret; if (!*nr_zones) return 0; @@ -206,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; + /* + * We cannot report zones beyond the zone end. So, it is OK to + * cap *nr_zones at the zone end. 
+ */ + *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); + + for (i = 0; i < *nr_zones; i++) { + struct blk_zone *zone_info; + + zone_info = &zinfo->zone_cache[zno + i]; + if (!zone_info->len) + break; + } + + if (i == *nr_zones) { + /* Cache hit on all the zones */ + memcpy(zones, zinfo->zone_cache + zno, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; + } + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -219,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!ret) return -EIO; + /* Populate cache */ + if (zinfo->zone_cache) + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); + return 0; } @@ -282,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - ret = btrfs_get_dev_zone_info(device); + ret = btrfs_get_dev_zone_info(device, true); if (ret) break; } @@ -291,11 +345,14 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_get_dev_zone_info(struct btrfs_device *device) +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; + struct request_queue *queue = bdev_get_queue(bdev); + unsigned int max_active_zones; + unsigned int nactive; sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -318,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + device->zone_info = zone_info; + if (!bdev_is_zoned(bdev)) { if (!fs_info->zone_size) { ret = calculate_emulated_zone_size(fs_info); @@ -351,6 +410,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + max_active_zones = queue_max_active_zones(queue); + if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { + btrfs_err_in_rcu(fs_info, +"zoned: %s: max active zones %u is too small, need at least %u active zones", + rcu_str_deref(device->name), max_active_zones, + BTRFS_MIN_ACTIVE_ZONES); + ret = -EINVAL; + goto out; + } + zone_info->max_active_zones = max_active_zones; + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -363,13 +433,37 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) { + ret = -ENOMEM; + goto out; + } + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; } + /* + * Enable zone cache only for a zoned device. On a non-zoned device, we + * fill the zone info with emulated CONVENTIONAL zones, so no need to + * use the cache. 
+ */ + if (populate_cache && bdev_is_zoned(device->bdev)) { + zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * + zone_info->nr_zones); + if (!zone_info->zone_cache) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to allocate zone cache for %s", + rcu_str_deref(device->name)); + ret = -ENOMEM; + goto out; + } + } + /* Get zone types */ + nactive = 0; while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, @@ -380,8 +474,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) for (i = 0; i < nr_zones; i++) { if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) __set_bit(nreported, zone_info->seq_zones); - if (zones[i].cond == BLK_ZONE_COND_EMPTY) + switch (zones[i].cond) { + case BLK_ZONE_COND_EMPTY: __set_bit(nreported, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(nreported, zone_info->active_zones); + nactive++; + break; + } nreported++; } sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; @@ -396,6 +499,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + if (max_active_zones) { + if (nactive > max_active_zones) { + btrfs_err_in_rcu(device->fs_info, + "zoned: %u active zones on %s exceeds max_active_zones %u", + nactive, rcu_str_deref(device->name), + max_active_zones); + ret = -EIO; + goto out; + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); + } + /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -444,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) kfree(zones); - device->zone_info = zone_info; - switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: model = "host-managed zoned"; @@ -478,10 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: - bitmap_free(zone_info->empty_zones); - bitmap_free(zone_info->seq_zones); - kfree(zone_info); - device->zone_info = NULL; + btrfs_destroy_dev_zone_info(device); return ret; } @@ -493,8 +604,10 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) if (!zone_info) return; + bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); + vfree(zone_info->zone_cache); kfree(zone_info); device->zone_info = NULL; } @@ -585,7 +698,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in - * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * btrfs_create_chunk(). Since we want stripe_len == zone_size, * check the alignment here. */ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { @@ -664,7 +777,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { - ASSERT(reset->cond == BLK_ZONE_COND_FULL); + ASSERT(sb_zone_is_full(reset)); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, reset->start, reset->len, @@ -676,9 +789,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset->wp = reset->start; } } else if (ret != -ENOENT) { - /* For READ, we want the precious one */ + /* + * For READ, we want the previous one. Move the write pointer to + * the end of the other zone if it is at the head of a zone. 
+ */ + u64 zone_end = 0; + if (wp == zones[0].start << SECTOR_SHIFT) - wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT; + zone_end = zones[1].start + zones[1].capacity; + else if (wp == zones[1].start << SECTOR_SHIFT) + zone_end = zones[0].start + zones[0].capacity; + if (zone_end) + wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, + BTRFS_SUPER_INFO_SIZE); + wp -= BTRFS_SUPER_INFO_SIZE; } @@ -771,36 +895,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, return true; } -void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) { struct btrfs_zoned_device_info *zinfo = device->zone_info; struct blk_zone *zone; + int i; if (!is_sb_log_zone(zinfo, mirror)) - return; + return 0; zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; - if (zone->cond != BLK_ZONE_COND_FULL) { + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + /* Advance to the next zone */ + if (zone->cond == BLK_ZONE_COND_FULL) { + zone++; + continue; + } + if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + zone->wp += SUPER_INFO_SECTORS; + + if (sb_zone_is_full(zone)) { + /* + * No room left to write a new superblock. Since the + * superblock is written with REQ_SYNC, it is safe to + * finish the zone now. + * + * If the write pointer is exactly at the capacity, + * explicit ZONE_FINISH is not necessary. + */ + if (zone->wp != zone->start + zone->capacity) { + int ret; + + ret = blkdev_zone_mgmt(device->bdev, + REQ_OP_ZONE_FINISH, zone->start, + zone->len, GFP_NOFS); + if (ret) + return ret; + } - if (zone->wp == zone->start + zone->len) + zone->wp = zone->start + zone->len; zone->cond = BLK_ZONE_COND_FULL; - - return; + } + return 0; } - zone++; - ASSERT(zone->cond != BLK_ZONE_COND_FULL); - if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); - - if (zone->wp == zone->start + zone->len) - zone->cond = BLK_ZONE_COND_FULL; + /* All the zones are FULL. Should not reach here. */ + ASSERT(0); + return -EIO; } int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) @@ -895,6 +1039,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, return pos; } +static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return true; + + if (!test_bit(zno, zone_info->active_zones)) { + /* Active zone left? 
*/ + if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) + return false; + if (test_and_set_bit(zno, zone_info->active_zones)) { + /* Someone already set the bit */ + atomic_inc(&zone_info->active_zones_left); + } + } + + return true; +} + +static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return; + + if (test_and_clear_bit(zno, zone_info->active_zones)) + atomic_inc(&zone_info->active_zones_left); +} + int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { @@ -910,6 +1089,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, *bytes = length; while (length) { btrfs_dev_set_zone_empty(device, physical); + btrfs_dev_clear_active_zone(device, physical); physical += device->zone_info->zone_size; length -= device->zone_info->zone_size; } @@ -974,7 +1154,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, u64 *offset_ret) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; @@ -989,6 +1169,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, key.type = 0; key.offset = 0; + root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ if (!ret) @@ -1039,6 +1220,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; + u64 *caps = NULL; + unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1063,10 +1246,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) map = em->map_lookup; + cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + if (!cache->physical_map) { + ret = -ENOMEM; + goto out; + } + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); if (!alloc_offsets) { - free_extent_map(em); - return -ENOMEM; + ret = -ENOMEM; + goto out; + } + + caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); + if (!caps) { + ret = -ENOMEM; + goto out; + } + + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); + if (!active) { + ret = -ENOMEM; + goto out; } for (i = 0; i < map->num_stripes; i++) { @@ -1131,6 +1332,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + caps[i] = (zone.capacity << SECTOR_SHIFT); + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1144,14 +1347,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) alloc_offsets[i] = 0; break; case BLK_ZONE_COND_FULL: - alloc_offsets[i] = fs_info->zone_size; + alloc_offsets[i] = caps[i]; break; default: /* Partially used zone */ alloc_offsets[i] = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(i, active); break; } + + /* + * Consider a zone as active if we can allow any number of + * active zones. 
+ */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); } if (num_sequential > 0) @@ -1169,6 +1380,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * calculate_alloc_pointer() which takes extent buffer * locks to avoid deadlock. */ + + /* Zone capacity is always zone size in emulation */ + cache->zone_capacity = cache->length; if (new) { cache->alloc_offset = 0; goto out; } @@ -1195,6 +1409,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = caps[0]; + cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: case BTRFS_BLOCK_GROUP_RAID1: @@ -1210,6 +1426,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1218,6 +1441,14 @@ out: ret = -EIO; } + if (cache->alloc_offset > cache->zone_capacity) { + btrfs_err(fs_info, +"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", + cache->alloc_offset, cache->zone_capacity, + cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1229,6 +1460,12 @@ out: if (!ret) cache->meta_write_pointer = cache->alloc_offset + cache->start; + if (ret) { + kfree(cache->physical_map); + cache->physical_map = NULL; + } + bitmap_free(active); + kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1243,17 +1480,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - unusable = cache->alloc_offset - cache->used; - free = cache->length - cache->alloc_offset; + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; - - /* Should not have any excluded extents. Just in case, though */ - btrfs_free_excluded_extents(cache); } void btrfs_redirty_list_add(struct btrfs_transaction *trans, @@ -1304,6 +1539,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + /* + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the + * extent layout the relocation code has. + * Furthermore we have set aside our own block group from which only the + * relocation "process" can allocate and make sure only one process at a + * time can add pages to an extent that gets relocated, so it's safe to + * use regular REQ_OP_WRITE for this special case. 
+ */ + if (btrfs_is_data_reloc_root(inode->root)) + return false; + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) @@ -1391,29 +1637,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (!btrfs_is_zoned(fs_info)) return true; - cache = *cache_ret; + cache = btrfs_lookup_block_group(fs_info, eb->start); + if (!cache) + return true; - if (cache && (eb->start < cache->start || - cache->start + cache->length <= eb->start)) { + if (cache->meta_write_pointer != eb->start) { btrfs_put_block_group(cache); cache = NULL; - *cache_ret = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; } - if (!cache) - cache = btrfs_lookup_block_group(fs_info, eb->start); - - if (cache) { - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } - - *cache_ret = cache; - } + *cache_ret = cache; return ret; } @@ -1440,27 +1676,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, struct blk_zone *zone) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 mapped_length = PAGE_SIZE; unsigned int nofs_flag; int nmirrors; int i, ret; ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bbio); - if (ret || !bbio || mapped_length < PAGE_SIZE) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc); + if (ret || !bioc || mapped_length < PAGE_SIZE) { + btrfs_put_bioc(bioc); return -EIO; } - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) return -EINVAL; nofs_flag = memalloc_nofs_save(); - nmirrors = (int)bbio->num_stripes; + nmirrors = (int)bioc->num_stripes; for (i = 0; i < nmirrors; i++) { - u64 physical = bbio->stripes[i].physical; - struct btrfs_device *dev = bbio->stripes[i].dev; + u64 physical = bioc->stripes[i].physical; + struct btrfs_device *dev = bioc->stripes[i].dev; /* Missing device */ if (!dev->bdev) @@ -1530,3 +1766,270 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } + +/** + * Activate block group and underlying device zones + * + * @block_group: the block group to activate + * + * Return: true on success, false otherwise + */ +bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + bool ret; + + if (!btrfs_is_zoned(block_group->fs_info)) + return true; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return true; + + spin_lock(&block_group->lock); + + if (block_group->zone_is_active) { + ret = true; + goto out_unlock; + } + + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; + + spin_unlock(&block_group->lock); + + /* For the active block group list */ + btrfs_get_block_group(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + 
ASSERT(list_empty(&block_group->active_bg_list)); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + + return true; + +out_unlock: + spin_unlock(&block_group->lock); + return ret; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return 0; + + spin_lock(&block_group->lock); + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + return 0; + } + + /* Check if we have unwritten allocated space */ + if ((block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } + spin_unlock(&block_group->lock); + + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } + + block_group->zone_is_active = 0; + block_group->alloc_offset = block_group->zone_capacity; + block_group->free_space_ctl->free_space = 0; + btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); + spin_unlock(&block_group->lock); + + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + btrfs_dec_block_group_ro(block_group); + + if (!ret) { + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + } + + return ret; +} + +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) +{ + struct btrfs_device *device; + bool ret = false; + + if (!btrfs_is_zoned(fs_devices->fs_info)) + return true; + + /* Non-single profiles are not supported yet */ + ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); + + /* Check if there is a device with active zones left */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + + if (!device->bdev) + continue; + + if (!zinfo->max_active_zones || + atomic_read(&zinfo->active_zones_left)) { + ret = true; + break; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_zone_finish_endio(struct 
btrfs_fs_info *fs_info, u64 logical, u64 length) +{ + struct btrfs_block_group *block_group; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + ASSERT(block_group); + + if (logical + length < block_group->start + block_group->zone_capacity) + goto out; + + spin_lock(&block_group->lock); + + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + goto out; + } + + block_group->zone_is_active = 0; + /* We should have consumed all the free space */ + ASSERT(block_group->alloc_offset == block_group->zone_capacity); + ASSERT(block_group->free_space_ctl->free_space == 0); + btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); + spin_unlock(&block_group->lock); + + map = block_group->physical_map; + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (!device->zone_info->max_active_zones) + goto out; + + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + btrfs_put_block_group(block_group); + +out: + btrfs_put_block_group(block_group); +} + +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg == bg->start) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); +} + +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + if (!btrfs_is_zoned(fs_info)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->zone_info) { + vfree(device->zone_info->zone_cache); + device->zone_info->zone_cache = NULL; + } + } + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4b299705bb12..cbf016a7bb5d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -8,6 +8,7 @@ #include "volumes.h" #include "disk-io.h" #include "block-group.h" +#include "btrfs_inode.h" /* * Block groups with more than this value (percents) of unusable space will be @@ -23,8 +24,12 @@ struct btrfs_zoned_device_info { u64 zone_size; u8 zone_size_shift; u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; + unsigned long *active_zones; + struct blk_zone *zone_cache; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -32,7 +37,7 @@ struct btrfs_zoned_device_info { int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); -int btrfs_get_dev_zone_info(struct btrfs_device *device); +int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); @@ -40,7 +45,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); -void btrfs_advance_sb_log(struct btrfs_device *device, int 
mirror); +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, u64 hole_end, u64 num_bytes); @@ -66,6 +71,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +bool btrfs_zone_activate(struct btrfs_block_group *block_group); +int btrfs_zone_finish(struct btrfs_block_group *block_group); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -78,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i return 0; } -static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, + bool populate_cache) { return 0; } @@ -113,8 +126,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, return 0; } -static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) -{ } +static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + return 0; +} static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { @@ -199,6 +214,28 @@ static inline struct btrfs_device *btrfs_zoned_get_device( return ERR_PTR(-EOPNOTSUPP); } +static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + return true; +} + +static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + return 0; +} + +static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + u64 flags) +{ + return true; +} + +static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + +static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -317,4 +354,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->treelog_bg_lock); } +static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_lock(&inode->vfs_inode, 0); +} + +static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + + if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) + btrfs_inode_unlock(&inode->vfs_inode, 0); +} + #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 56dce9f00988..fc42dd0badd7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -28,10 +28,10 @@ /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, size_t src_len) { - 
ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); + zstd_parameters params = zstd_get_params(level, src_len); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; @@ -48,8 +48,8 @@ struct workspace { unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; - ZSTD_inBuffer in_buf; - ZSTD_outBuffer out_buf; + zstd_in_buffer in_buf; + zstd_out_buffer out_buf; }; /* @@ -155,12 +155,12 @@ static void zstd_calc_ws_mem_sizes(void) unsigned int level; for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { - ZSTD_parameters params = + zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = max_t(size_t, - ZSTD_CStreamWorkspaceBound(params.cParams), - ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + zstd_cstream_workspace_bound(¶ms.cParams), + zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); zstd_ws_mem_sizes[level - 1] = max_size; @@ -371,7 +371,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - ZSTD_CStream *stream; + zstd_cstream *stream; int ret = 0; int nr_pages = 0; struct page *in_page = NULL; /* The current page to read */ @@ -381,7 +381,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; const unsigned long nr_dest_pages = *out_pages; unsigned long max_out = nr_dest_pages * PAGE_SIZE; - ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level, + zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_pages = 0; @@ -389,17 +389,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = 0; /* Initialize the stream */ - stream = ZSTD_initCStream(params, len, workspace->mem, + stream = zstd_init_cstream(¶ms, len, workspace->mem, workspace->size); if (!stream) { - pr_warn("BTRFS: ZSTD_initCStream failed\n"); + pr_warn("BTRFS: zstd_init_cstream failed\n"); ret = -EIO; goto out; } /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,18 +411,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); while (1) { size_t ret2; - ret2 = ZSTD_compressStream(stream, &workspace->out_buf, + ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_compressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_compress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto out; } @@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head 
*ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; + kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -485,10 +487,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, while (1) { size_t ret2; - ret2 = ZSTD_endStream(stream, &workspace->out_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_endStream returned %d\n", - ZSTD_getErrorCode(ret2)); + ret2 = zstd_end_stream(stream, &workspace->out_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_end_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto out; } @@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } + if (out_page) + kunmap(out_page); return ret; } @@ -541,22 +548,22 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; size_t srclen = cb->compressed_len; - ZSTD_DStream *stream; + zstd_dstream *stream; int ret = 0; unsigned long page_in_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; - stream = ZSTD_initDStream( + stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { - pr_debug("BTRFS: ZSTD_initDStream failed\n"); + pr_debug("BTRFS: zstd_init_dstream failed\n"); ret = -EIO; goto done; } - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -567,11 +574,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) while (1) { size_t ret2; - ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto done; } @@ -592,14 +599,14 @@ int zstd_decompress_bio(struct 
list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - page_in_index++; + kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; zero_fill_bio(cb->orig_bio); done: + if (workspace->in_buf.src) + kunmap(pages_in[page_in_index]); return ret; } @@ -615,16 +624,16 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - ZSTD_DStream *stream; + zstd_dstream *stream; int ret = 0; size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - stream = ZSTD_initDStream( + stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { - pr_warn("BTRFS: ZSTD_initDStream failed\n"); + pr_warn("BTRFS: zstd_init_dstream failed\n"); ret = -EIO; goto finish; } @@ -648,15 +657,15 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, /* Check if the frame is over and we still need more input */ if (ret2 == 0) { - pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + pr_debug("BTRFS: zstd_decompress_stream ended early\n"); ret = -EIO; goto finish; } - ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto finish; }
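
A note on the zone-report cache added to btrfs_get_dev_zones() above: the device's zones are mirrored into a vzalloc'ed array of struct blk_zone indexed by zone number, an entry with len == 0 means "not cached", a request is served from the cache only when every requested entry is present, and a real device report refills the affected slots. The following standalone C sketch shows just that lookup/populate shape; the types, the fake device_report() helper, and all names are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_ZONES 8U

struct zone_desc {
        uint64_t len;   /* 0 means "not cached" */
        uint64_t wp;
};

static struct zone_desc zone_cache[NR_ZONES];

/* Stand-in for blkdev_report_zones(): pretend every zone is 256 sectors. */
static void device_report(unsigned int zno, struct zone_desc *out,
                          unsigned int nr)
{
        for (unsigned int i = 0; i < nr; i++) {
                out[i].len = 256;
                out[i].wp = (uint64_t)(zno + i) * 256;
        }
        printf("device report for zones %u..%u\n", zno, zno + nr - 1);
}

static void get_zones(unsigned int zno, struct zone_desc *out,
                      unsigned int *nr)
{
        unsigned int i;

        /* Cannot report zones beyond the device end, so cap the count. */
        if (*nr > NR_ZONES - zno)
                *nr = NR_ZONES - zno;

        for (i = 0; i < *nr; i++)
                if (!zone_cache[zno + i].len)
                        break;
        if (i == *nr) {
                /* Cache hit on all the requested zones */
                memcpy(out, &zone_cache[zno], sizeof(*out) * *nr);
                printf("cache hit for zones %u..%u\n", zno, zno + *nr - 1);
                return;
        }
        device_report(zno, out, *nr);
        /* Populate the cache for the next caller */
        memcpy(&zone_cache[zno], out, sizeof(*out) * *nr);
}

int main(void)
{
        struct zone_desc buf[4];
        unsigned int nr = 4;

        get_zones(0, buf, &nr);         /* first call queries the device */
        nr = 4;
        get_zones(0, buf, &nr);         /* second call hits the cache */
        return 0;
}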
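The superblock log changes (sb_zone_is_full(), the READ path of sb_log_location(), and the reworked btrfs_advance_sb_log()) reduce to sector arithmetic: a zone whose capacity is smaller than its size cannot take another 4KiB superblock copy once the write pointer is within SUPER_INFO_SECTORS of the capacity, and the most recent copy sits at the last 4KiB-aligned offset below the other zone's capacity end. A small standalone check of that arithmetic, with made-up zone geometry:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT            9
#define BTRFS_SUPER_INFO_SIZE   4096ULL
#define SUPER_INFO_SECTORS      (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
#define ALIGN_DOWN(x, a)        ((x) - ((x) % (a)))

struct zone {
        uint64_t start;         /* in 512-byte sectors */
        uint64_t wp;
        uint64_t capacity;
        bool full_cond;         /* reported BLK_ZONE_COND_FULL */
};

/*
 * Mirrors sb_zone_is_full(): no room for another superblock copy once the
 * zone is marked FULL or the write pointer is within SUPER_INFO_SECTORS of
 * the usable capacity.
 */
static bool zone_is_full(const struct zone *z)
{
        return z->full_cond ||
               z->wp + SUPER_INFO_SECTORS > z->start + z->capacity;
}

int main(void)
{
        /* A 256-sector zone with only 248 sectors of usable capacity. */
        struct zone z = { .start = 0, .wp = 244, .capacity = 248 };

        printf("full: %s\n", zone_is_full(&z) ? "yes" : "no");

        /*
         * READ path: if the write pointer sits at the head of one zone, the
         * latest superblock is the last aligned copy below the end of the
         * other zone's capacity.
         */
        uint64_t zone_end = (z.start + z.capacity) << SECTOR_SHIFT;
        uint64_t last_sb = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
                           BTRFS_SUPER_INFO_SIZE;
        printf("previous superblock at byte offset %llu\n",
               (unsigned long long)last_sb);
        return 0;
}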
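The active-zone accounting introduced for devices with a max_active_zones limit pairs an atomic budget (active_zones_left) with a per-zone bitmap, and hands the budget back when it loses the race to set a bit, as in btrfs_dev_set_active_zone()/btrfs_dev_clear_active_zone(). Below is a minimal userspace sketch of that reservation protocol using C11 atomics; all names are illustrative, and unlike the kernel's atomic_dec_if_positive() the counter here can dip below zero briefly before the losing caller restores it.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 8U

static atomic_int active_zones_left = 3;        /* max_active_zones - nactive */
static atomic_uint active_bitmap;               /* one bit per zone */

static bool zone_set_active(unsigned int zno)
{
        unsigned int bit = 1U << zno;

        if (atomic_load(&active_bitmap) & bit)
                return true;    /* already active, nothing to account */
        /* Take one unit of budget; fail if none is left. */
        if (atomic_fetch_sub(&active_zones_left, 1) <= 0) {
                atomic_fetch_add(&active_zones_left, 1);
                return false;
        }
        /* Someone else set the bit meanwhile: hand the budget back. */
        if (atomic_fetch_or(&active_bitmap, bit) & bit)
                atomic_fetch_add(&active_zones_left, 1);
        return true;
}

static void zone_clear_active(unsigned int zno)
{
        unsigned int bit = 1U << zno;

        if (atomic_fetch_and(&active_bitmap, ~bit) & bit)
                atomic_fetch_add(&active_zones_left, 1);
}

int main(void)
{
        /* With a budget of 3, zones 0-2 activate and the rest are refused. */
        for (unsigned int zno = 0; zno < NR_ZONES; zno++)
                printf("zone %u: %s\n", zno,
                       zone_set_active(zno) ? "activated" : "refused");
        zone_clear_active(0);
        printf("budget after clearing zone 0: %d\n",
               atomic_load(&active_zones_left));
        return 0;
}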
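Finally, the btrfs_calc_zone_unusable() change folds the gap between zone size and zone capacity into the unusable byte count and measures free space against the capacity instead of the block group length. A worked example with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t length        = 256ULL << 20;  /* block group (zone) size */
        uint64_t zone_capacity = 192ULL << 20;  /* usable zone capacity */
        uint64_t alloc_offset  = 128ULL << 20;  /* write pointer progress */
        uint64_t used          =  96ULL << 20;  /* live (referenced) bytes */

        /* Freed-but-unreclaimable bytes plus the capacity/size gap. */
        uint64_t unusable = (alloc_offset - used) + (length - zone_capacity);
        /* Allocation can only continue up to the zone capacity. */
        uint64_t free_space = zone_capacity - alloc_offset;

        /* Prints: unusable: 96 MiB, free: 64 MiB */
        printf("unusable: %llu MiB, free: %llu MiB\n",
               (unsigned long long)(unusable >> 20),
               (unsigned long long)(free_space >> 20));
        return 0;
}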