From 94ead93e63758f2c7cbe0c68ca232fff812ca33e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Apr 2023 13:57:18 +0800 Subject: btrfs: scrub: use recovered data stripes as cache to avoid unnecessary read For P/Q stripe scrub, we have quite some duplicated read IO: - Data stripes read for verification This is triggered by the scrub_submit_initial_read() inside scrub_raid56_parity_stripe(). - Data stripes read (again) for P/Q stripe verification This is triggered by scrub_assemble_read_bios() from scrub_rbio(). Although we can have hit rbio cache and avoid unnecessary read, the chance is very low, as scrub would easily flush the whole rbio cache. This means, even we're just scrubbing a single P/Q stripe, we would read the data stripes twice for the best case scenario. If we need to recover some data stripes, it would cause more reads on the same data stripes, again and again. However before we call raid56_parity_submit_scrub_rbio() we already have all data stripes repaired and their contents ready to use. But RAID56 cache is unaware about the scrub cache, thus RAID56 layer itself still needs to re-read the data stripes. To avoid such cache miss, this patch would: - Introduce a new helper, raid56_parity_cache_data_pages() This function would grab the pages from an array, and copy the content to the rbio, marking all the involved sectors uptodate. The page copy is unavoidable because of the cache pages of rbio are all self managed, thus can not utilize outside pages without screwing up the lifespan. - Use the repaired data stripes as cache inside scrub_raid56_parity_stripe() By this, we ensure all the data sectors of the scrub rbio are already uptodate, and no need to read them again from disk. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bceaa8c2007e..f231883f504a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1972,6 +1972,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_dec(fs_info); goto out; } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_pages(rbio, stripe->pages, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } raid56_parity_submit_scrub_rbio(rbio); wait_for_completion_io(&io_done); ret = blk_status_to_errno(bio->bi_status); -- cgit From b7f9945a1479a97a7aa2cc2b9d36e72f33fcb174 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 11 May 2023 16:01:44 +0800 Subject: btrfs: handle tree backref walk error properly [BUG] Smatch reports the following errors related to commit ("btrfs: output affected files when relocation fails"): fs/btrfs/inode.c:283 print_data_reloc_error() error: uninitialized symbol 'ref_level'. [CAUSE] That part of code is mostly copied from scrub, but unfortunately scrub code from the beginning is not doing the error handling properly. The offending code looks like this: do { ret = tree_backref_for_extent(); btrfs_warn_rl(); } while (ret != 1); There are several problems involved: - No error handling If that tree_backref_for_extent() failed, we would output the same error again and again, never really exit as it requires ret == 1 to exit. - Always do one extra output As tree_backref_for_extent() only return > 0 if there is no more backref item. This means after the last item we hit, we would output an invalid error message for ret > 0 case. [FIX] Fix the old code by: - Move @ref_root and @ref_level into the if branch And do not initialize them, so we can catch such uninitialized values just like what we do in the inode.c - Explicitly check the return value of tree_backref_for_extent() And handle ret < 0 and ret > 0 cases properly. - No more do {} while () loop Instead go while (true) {} loop since we will handle @ret manually. Reported-by: Dan Carpenter Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f231883f504a..265db0c15a4c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -479,11 +479,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - unsigned long ptr = 0; u64 flags = 0; - u64 ref_root; u32 item_size; - u8 ref_level = 0; int ret; /* Super block error, no need to search extent tree. */ @@ -513,19 +510,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - do { + unsigned long ptr = 0; + u8 ref_level; + u64 ref_root; + + while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); + if (ret < 0) { + btrfs_warn(fs_info, + "failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); + break; + } + if (ret > 0) + break; btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", - errstr, swarn.logical, - btrfs_dev_name(dev), - swarn.physical, - ref_level ? "node" : "leaf", - ret < 0 ? -1 : ref_level, - ret < 0 ? -1 : ref_root); - } while (ret != 1); + errstr, swarn.logical, btrfs_dev_name(dev), + swarn.physical, (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; -- cgit From b9cb105e737f32b9cc18c4f47edabdb0ae66d569 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 12 May 2023 13:44:57 +0800 Subject: btrfs: scrub: remove more unused functions These functions are defined in the scrub.c file, but last callers were removed in e9255d6c4054 ("btrfs: scrub: remove the old scrub recheck code"). fs/btrfs/scrub.c:553:20: warning: unused function 'scrub_stripe_index_and_offset'. fs/btrfs/scrub.c:543:19: warning: unused function 'scrub_nr_raid_mirrors'. Reported-by: Abaci Robot Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=4937 Signed-off-by: Jiapeng Chong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 265db0c15a4c..d7a14458c4a7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -552,48 +552,6 @@ out: btrfs_free_path(path); } -static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) -{ - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - return 2; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - return 3; - else - return (int)bioc->num_stripes; -} - -static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 full_stripe_logical, - int nstripes, int mirror, - int *stripe_index, - u64 *stripe_offset) -{ - int i; - - if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? - nstripes - 1 : nstripes - 2; - - /* RAID5/6 */ - for (i = 0; i < nr_data_stripes; i++) { - const u64 data_stripe_start = full_stripe_logical + - (i * BTRFS_STRIPE_LEN); - - if (logical >= data_stripe_start && - logical < data_stripe_start + BTRFS_STRIPE_LEN) - break; - } - - *stripe_index = i; - *stripe_offset = (logical - full_stripe_logical) & - BTRFS_STRIPE_LEN_MASK; - } else { - /* The other RAID type */ - *stripe_index = mirror; - *stripe_offset = 0; - } -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; -- cgit From 58e814fcacc1f652d2b794c82b7c9d96ee3c3bab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 May 2023 13:33:08 -1000 Subject: btrfs: use alloc_ordered_workqueue() to create ordered workqueues BACKGROUND ========== When multiple work items are queued to a workqueue, their execution order doesn't match the queueing order. They may get executed in any order and simultaneously. When fully serialized execution - one by one in the queueing order - is needed, an ordered workqueue should be used which can be created with alloc_ordered_workqueue(). However, alloc_ordered_workqueue() was a later addition. Before it, an ordered workqueue could be obtained by creating an UNBOUND workqueue with @max_active==1. This originally was an implementation side-effect which was broken by 4c16bd327c74 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered"). Because there were users that depended on the ordered execution, 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") made workqueue allocation path to implicitly promote UNBOUND workqueues w/ @max_active==1 to ordered workqueues. While this has worked okay, overloading the UNBOUND allocation interface this way creates other issues. It's difficult to tell whether a given workqueue actually needs to be ordered and users that legitimately want a min concurrency level wq unexpectedly gets an ordered one instead. With planned UNBOUND workqueue updates to improve execution locality and more prevalence of chiplet designs which can benefit from such improvements, this isn't a state we wanna be in forever. This patch series audits all call sites that create an UNBOUND workqueue w/ @max_active==1 and converts them to alloc_ordered_workqueue() as necessary. BTRFS ===== * fs_info->scrub_workers initialized in scrub_workers_get() was setting @max_active to 1 when @is_dev_replace is set and it seems that the workqueue actually needs to be ordered if @is_dev_replace. Update the code so that alloc_ordered_workqueue() is used if @is_dev_replace. * fs_info->discard_ctl.discard_workers initialized in btrfs_init_workqueues() was directly using alloc_workqueue() w/ @max_active==1. Converted to alloc_ordered_workqueue(). * fs_info->fixup_workers and fs_info->qgroup_rescan_workers initialized in btrfs_queue_work() use the btrfs's workqueue wrapper, btrfs_workqueue, which are allocated with btrfs_alloc_workqueue(). btrfs_workqueue implements automatic @max_active adjustment which is disabled when the specified max limit is below a certain threshold, so calling btrfs_alloc_workqueue() with @limit_active==1 yields an ordered workqueue whose @max_active won't be changed as the auto-tuning is disabled. This is rather brittle in that nothing clearly indicates that the two workqueues should be ordered or btrfs_alloc_workqueue() must disable auto-tuning when @limit_active==1. This patch factors out the common btrfs_workqueue init code into btrfs_init_workqueue() and add explicit btrfs_alloc_ordered_workqueue(). The two workqueues are converted to use the new ordered allocation interface. Signed-off-by: Tejun Heo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d7a14458c4a7..316c5a0e3a44 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2740,8 +2740,10 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = alloc_workqueue("btrfs-scrub", flags, - is_dev_replace ? 1 : max_active); + if (is_dev_replace) + scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); + else + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) goto fail_scrub_workers; -- cgit From 723b8bb17e2e680934f59823ab1e72c1000d88d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:38 +0200 Subject: btrfs: open code btrfs_map_sblock btrfs_map_sblock just hard codes three arguments and calls btrfs_map_sblock. Remove it as it doesn't provide any real value, but makes following the btrfs_map_block call chains harder. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 316c5a0e3a44..0cf8e2f277da 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -888,8 +888,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* For scrub, our mirror_num should always start at 1. */ ASSERT(stripe->mirror_num >= 1); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - stripe->logical, &mapped_len, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc, + NULL, NULL, 1); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -1921,8 +1922,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio->bi_end_io = raid56_scrub_wait_endio; btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL, 1); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); -- cgit From c2bbc0bab0bb3cdd38914fe714f9b6c3f7544e88 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Jun 2023 14:14:29 +0800 Subject: btrfs: scrub: remove scrub_ctx::csum_list member Since the rework of scrub introduced by commit 2af2aaf98205 ("btrfs: scrub: introduce structure for new BTRFS_STRIPE_LEN based interface") and later commits, scrub no longer keeps its data checksum inside sctx. Instead we have scrub_stripe::csums for the checksum of the stripe. Thus we can remove the unused scrub_ctx::csum_list member. Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0cf8e2f277da..297beaefd49c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -177,7 +177,6 @@ struct scrub_ctx { struct btrfs_fs_info *fs_info; int first_free; int cur_stripe; - struct list_head csum_list; atomic_t cancel_req; int readonly; int sectors_per_bio; @@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -static void scrub_free_csums(struct scrub_ctx *sctx) -{ - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } -} - static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; @@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) release_scrub_stripe(&sctx->stripes[i]); - scrub_free_csums(sctx); kfree(sctx); } @@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; -- cgit From 81db6ae842b3c07edd67278e3693f53a28d694cf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Jun 2023 15:23:29 +0800 Subject: btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers Since the scrub rework introduced by commit 2af2aaf98205 ("btrfs: scrub: introduce structure for new BTRFS_STRIPE_LEN based interface") and later commits, scrub only needs one single workqueue, fs_info::scrub_worker. That scrub_wr_completion_workers is initially to handle the delay work after write bios finished. But the new scrub code goes submit-and-wait for write bios, thus all the work are done inside the scrub_worker. The last user of fs_info::scrub_wr_completion_workers is removed in commit 16f93993498b ("btrfs: scrub: remove the old writeback infrastructure"), so we can safely remove the workqueue. Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 297beaefd49c..2c7fdbb60314 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2698,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { struct workqueue_struct *scrub_workers = fs_info->scrub_workers; - struct workqueue_struct *scrub_wr_comp = - fs_info->scrub_wr_completion_workers; fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); - if (scrub_wr_comp) - destroy_workqueue(scrub_wr_comp); } } @@ -2719,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { struct workqueue_struct *scrub_workers = NULL; - struct workqueue_struct *scrub_wr_comp = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -2732,18 +2726,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, else scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) - goto fail_scrub_workers; - - scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); - if (!scrub_wr_comp) - goto fail_scrub_wr_completion_workers; + return -ENOMEM; mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL); + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = scrub_workers; - fs_info->scrub_wr_completion_workers = scrub_wr_comp; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -2754,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = 0; - destroy_workqueue(scrub_wr_comp); -fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); -fail_scrub_workers: return ret; } -- cgit