From 94ead93e63758f2c7cbe0c68ca232fff812ca33e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 13 Apr 2023 13:57:18 +0800
Subject: btrfs: scrub: use recovered data stripes as cache to avoid
 unnecessary read

For P/Q stripe scrub, we have quite some duplicated read IO:

- Data stripes read for verification
  This is triggered by the scrub_submit_initial_read() inside
  scrub_raid56_parity_stripe().

- Data stripes read (again) for P/Q stripe verification
  This is triggered by scrub_assemble_read_bios() from scrub_rbio().

  Although we can have hit rbio cache and avoid unnecessary read, the
  chance is very low, as scrub would easily flush the whole rbio cache.

This means, even we're just scrubbing a single P/Q stripe, we would read
the data stripes twice for the best case scenario.  If we need to
recover some data stripes, it would cause more reads on the same data
stripes, again and again.

However before we call raid56_parity_submit_scrub_rbio() we already
have all data stripes repaired and their contents ready to use.
But RAID56 cache is unaware about the scrub cache, thus RAID56 layer
itself still needs to re-read the data stripes.

To avoid such cache miss, this patch would:

- Introduce a new helper, raid56_parity_cache_data_pages()
  This function would grab the pages from an array, and copy the content
  to the rbio, marking all the involved sectors uptodate.

  The page copy is unavoidable because of the cache pages of rbio are all
  self managed, thus can not utilize outside pages without screwing up
  the lifespan.

- Use the repaired data stripes as cache inside
  scrub_raid56_parity_stripe()

By this, we ensure all the data sectors of the scrub rbio are already
uptodate, and no need to read them again from disk.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bceaa8c2007e..f231883f504a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1972,6 +1972,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 		btrfs_bio_counter_dec(fs_info);
 		goto out;
 	}
+	/* Use the recovered stripes as cache to avoid read them from disk again. */
+	for (int i = 0; i < data_stripes; i++) {
+		stripe = &sctx->raid56_data_stripes[i];
+
+		raid56_parity_cache_data_pages(rbio, stripe->pages,
+				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
+	}
 	raid56_parity_submit_scrub_rbio(rbio);
 	wait_for_completion_io(&io_done);
 	ret = blk_status_to_errno(bio->bi_status);
-- 
cgit 


From b7f9945a1479a97a7aa2cc2b9d36e72f33fcb174 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 11 May 2023 16:01:44 +0800
Subject: btrfs: handle tree backref walk error properly

[BUG]
Smatch reports the following errors related to commit ("btrfs: output
affected files when relocation fails"):

	fs/btrfs/inode.c:283 print_data_reloc_error()
	error: uninitialized symbol 'ref_level'.

[CAUSE]
That part of code is mostly copied from scrub, but unfortunately scrub
code from the beginning is not doing the error handling properly.

The offending code looks like this:

	do {
		ret = tree_backref_for_extent();
		btrfs_warn_rl();
	} while (ret != 1);

There are several problems involved:

- No error handling
  If that tree_backref_for_extent() failed, we would output the same
  error again and again, never really exit as it requires ret == 1 to
  exit.

- Always do one extra output
  As tree_backref_for_extent() only return > 0 if there is no more
  backref item.
  This means after the last item we hit, we would output an invalid
  error message for ret > 0 case.

[FIX]
Fix the old code by:

- Move @ref_root and @ref_level into the if branch
  And do not initialize them, so we can catch such uninitialized values
  just like what we do in the inode.c

- Explicitly check the return value of tree_backref_for_extent()
  And handle ret < 0 and ret > 0 cases properly.

- No more do {} while () loop
  Instead go while (true) {} loop since we will handle @ret manually.

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f231883f504a..265db0c15a4c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -479,11 +479,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct scrub_warning swarn;
-	unsigned long ptr = 0;
 	u64 flags = 0;
-	u64 ref_root;
 	u32 item_size;
-	u8 ref_level = 0;
 	int ret;
 
 	/* Super block error, no need to search extent tree. */
@@ -513,19 +510,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
 	item_size = btrfs_item_size(eb, path->slots[0]);
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		do {
+		unsigned long ptr = 0;
+		u8 ref_level;
+		u64 ref_root;
+
+		while (true) {
 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 						      item_size, &ref_root,
 						      &ref_level);
+			if (ret < 0) {
+				btrfs_warn(fs_info,
+				"failed to resolve tree backref for logical %llu: %d",
+						  swarn.logical, ret);
+				break;
+			}
+			if (ret > 0)
+				break;
 			btrfs_warn_in_rcu(fs_info,
 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
-				errstr, swarn.logical,
-				btrfs_dev_name(dev),
-				swarn.physical,
-				ref_level ? "node" : "leaf",
-				ret < 0 ? -1 : ref_level,
-				ret < 0 ? -1 : ref_root);
-		} while (ret != 1);
+				errstr, swarn.logical, btrfs_dev_name(dev),
+				swarn.physical, (ref_level ? "node" : "leaf"),
+				ref_level, ref_root);
+		}
 		btrfs_release_path(path);
 	} else {
 		struct btrfs_backref_walk_ctx ctx = { 0 };
-- 
cgit 


From b9cb105e737f32b9cc18c4f47edabdb0ae66d569 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 12 May 2023 13:44:57 +0800
Subject: btrfs: scrub: remove more unused functions

These functions are defined in the scrub.c file, but last callers were
removed in e9255d6c4054 ("btrfs: scrub: remove the old scrub recheck
code").

fs/btrfs/scrub.c:553:20: warning: unused function 'scrub_stripe_index_and_offset'.
fs/btrfs/scrub.c:543:19: warning: unused function 'scrub_nr_raid_mirrors'.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=4937
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 42 ------------------------------------------
 1 file changed, 42 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 265db0c15a4c..d7a14458c4a7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -552,48 +552,6 @@ out:
 	btrfs_free_path(path);
 }
 
-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
-{
-	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
-		return 2;
-	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
-		return 3;
-	else
-		return (int)bioc->num_stripes;
-}
-
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
-						 u64 full_stripe_logical,
-						 int nstripes, int mirror,
-						 int *stripe_index,
-						 u64 *stripe_offset)
-{
-	int i;
-
-	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
-					    nstripes - 1 : nstripes - 2;
-
-		/* RAID5/6 */
-		for (i = 0; i < nr_data_stripes; i++) {
-			const u64 data_stripe_start = full_stripe_logical +
-						(i * BTRFS_STRIPE_LEN);
-
-			if (logical >= data_stripe_start &&
-			    logical < data_stripe_start + BTRFS_STRIPE_LEN)
-				break;
-		}
-
-		*stripe_index = i;
-		*stripe_offset = (logical - full_stripe_logical) &
-				 BTRFS_STRIPE_LEN_MASK;
-	} else {
-		/* The other RAID type */
-		*stripe_index = mirror;
-		*stripe_offset = 0;
-	}
-}
-
 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
 {
 	int ret = 0;
-- 
cgit 


From 58e814fcacc1f652d2b794c82b7c9d96ee3c3bab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 25 May 2023 13:33:08 -1000
Subject: btrfs: use alloc_ordered_workqueue() to create ordered workqueues

BACKGROUND
==========

When multiple work items are queued to a workqueue, their execution order
doesn't match the queueing order. They may get executed in any order and
simultaneously. When fully serialized execution - one by one in the queueing
order - is needed, an ordered workqueue should be used which can be created
with alloc_ordered_workqueue().

However, alloc_ordered_workqueue() was a later addition. Before it, an
ordered workqueue could be obtained by creating an UNBOUND workqueue with
@max_active==1. This originally was an implementation side-effect which was
broken by 4c16bd327c74 ("workqueue: restore WQ_UNBOUND/max_active==1 to be
ordered"). Because there were users that depended on the ordered execution,
5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered")
made workqueue allocation path to implicitly promote UNBOUND workqueues w/
@max_active==1 to ordered workqueues.

While this has worked okay, overloading the UNBOUND allocation interface
this way creates other issues. It's difficult to tell whether a given
workqueue actually needs to be ordered and users that legitimately want a
min concurrency level wq unexpectedly gets an ordered one instead. With
planned UNBOUND workqueue updates to improve execution locality and more
prevalence of chiplet designs which can benefit from such improvements, this
isn't a state we wanna be in forever.

This patch series audits all call sites that create an UNBOUND workqueue w/
@max_active==1 and converts them to alloc_ordered_workqueue() as necessary.

BTRFS
=====

* fs_info->scrub_workers initialized in scrub_workers_get() was setting
  @max_active to 1 when @is_dev_replace is set and it seems that the
  workqueue actually needs to be ordered if @is_dev_replace. Update the code
  so that alloc_ordered_workqueue() is used if @is_dev_replace.

* fs_info->discard_ctl.discard_workers initialized in
  btrfs_init_workqueues() was directly using alloc_workqueue() w/
  @max_active==1. Converted to alloc_ordered_workqueue().

* fs_info->fixup_workers and fs_info->qgroup_rescan_workers initialized in
  btrfs_queue_work() use the btrfs's workqueue wrapper, btrfs_workqueue,
  which are allocated with btrfs_alloc_workqueue().

  btrfs_workqueue implements automatic @max_active adjustment which is
  disabled when the specified max limit is below a certain threshold, so
  calling btrfs_alloc_workqueue() with @limit_active==1 yields an ordered
  workqueue whose @max_active won't be changed as the auto-tuning is
  disabled.

  This is rather brittle in that nothing clearly indicates that the two
  workqueues should be ordered or btrfs_alloc_workqueue() must disable
  auto-tuning when @limit_active==1.

  This patch factors out the common btrfs_workqueue init code into
  btrfs_init_workqueue() and add explicit btrfs_alloc_ordered_workqueue().
  The two workqueues are converted to use the new ordered allocation
  interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d7a14458c4a7..316c5a0e3a44 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2740,8 +2740,10 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
 		return 0;
 
-	scrub_workers = alloc_workqueue("btrfs-scrub", flags,
-					is_dev_replace ? 1 : max_active);
+	if (is_dev_replace)
+		scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
+	else
+		scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
 	if (!scrub_workers)
 		goto fail_scrub_workers;
 
-- 
cgit 


From 723b8bb17e2e680934f59823ab1e72c1000d88d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 31 May 2023 06:17:38 +0200
Subject: btrfs: open code btrfs_map_sblock

btrfs_map_sblock just hard codes three arguments and calls
btrfs_map_sblock.  Remove it as it doesn't provide any real value, but
makes following the btrfs_map_block call chains harder.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 316c5a0e3a44..0cf8e2f277da 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -888,8 +888,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
 
 		/* For scrub, our mirror_num should always start at 1. */
 		ASSERT(stripe->mirror_num >= 1);
-		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-				       stripe->logical, &mapped_len, &bioc);
+		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+				      stripe->logical, &mapped_len, &bioc,
+				      NULL, NULL, 1);
 		/*
 		 * If we failed, dev will be NULL, and later detailed reports
 		 * will just be skipped.
@@ -1921,8 +1922,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 	bio->bi_end_io = raid56_scrub_wait_endio;
 
 	btrfs_bio_counter_inc_blocked(fs_info);
-	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
-			       &length, &bioc);
+	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+			      &length, &bioc, NULL, NULL, 1);
 	if (ret < 0) {
 		btrfs_put_bioc(bioc);
 		btrfs_bio_counter_dec(fs_info);
-- 
cgit 


From c2bbc0bab0bb3cdd38914fe714f9b6c3f7544e88 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 12 Jun 2023 14:14:29 +0800
Subject: btrfs: scrub: remove scrub_ctx::csum_list member

Since the rework of scrub introduced by commit 2af2aaf98205 ("btrfs:
scrub: introduce structure for new BTRFS_STRIPE_LEN based interface")
and later commits, scrub no longer keeps its data checksum inside sctx.

Instead we have scrub_stripe::csums for the checksum of the stripe.
Thus we can remove the unused scrub_ctx::csum_list member.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0cf8e2f277da..297beaefd49c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -177,7 +177,6 @@ struct scrub_ctx {
 	struct btrfs_fs_info	*fs_info;
 	int			first_free;
 	int			cur_stripe;
-	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
 	int			sectors_per_bio;
@@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 	scrub_pause_off(fs_info);
 }
 
-static void scrub_free_csums(struct scrub_ctx *sctx)
-{
-	while (!list_empty(&sctx->csum_list)) {
-		struct btrfs_ordered_sum *sum;
-		sum = list_first_entry(&sctx->csum_list,
-				       struct btrfs_ordered_sum, list);
-		list_del(&sum->list);
-		kfree(sum);
-	}
-}
-
 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
@@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
 		release_scrub_stripe(&sctx->stripes[i]);
 
-	scrub_free_csums(sctx);
 	kfree(sctx);
 }
 
@@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	refcount_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->fs_info = fs_info;
-	INIT_LIST_HEAD(&sctx->csum_list);
 	for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
 		int ret;
 
-- 
cgit 


From 81db6ae842b3c07edd67278e3693f53a28d694cf Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 12 Jun 2023 15:23:29 +0800
Subject: btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers

Since the scrub rework introduced by commit 2af2aaf98205 ("btrfs:
scrub: introduce structure for new BTRFS_STRIPE_LEN based interface")
and later commits, scrub only needs one single workqueue,
fs_info::scrub_worker.

That scrub_wr_completion_workers is initially to handle the delay work
after write bios finished.  But the new scrub code goes submit-and-wait
for write bios, thus all the work are done inside the scrub_worker.

The last user of fs_info::scrub_wr_completion_workers is removed in
commit 16f93993498b ("btrfs: scrub: remove the old writeback
infrastructure"), so we can safely remove the workqueue.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs/btrfs/scrub.c')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 297beaefd49c..2c7fdbb60314 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2698,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
 	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
 					&fs_info->scrub_lock)) {
 		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
-		struct workqueue_struct *scrub_wr_comp =
-						fs_info->scrub_wr_completion_workers;
 
 		fs_info->scrub_workers = NULL;
-		fs_info->scrub_wr_completion_workers = NULL;
 		mutex_unlock(&fs_info->scrub_lock);
 
 		if (scrub_workers)
 			destroy_workqueue(scrub_workers);
-		if (scrub_wr_comp)
-			destroy_workqueue(scrub_wr_comp);
 	}
 }
 
@@ -2719,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
 	struct workqueue_struct *scrub_workers = NULL;
-	struct workqueue_struct *scrub_wr_comp = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 	int ret = -ENOMEM;
@@ -2732,18 +2726,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	else
 		scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
 	if (!scrub_workers)
-		goto fail_scrub_workers;
-
-	scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
-	if (!scrub_wr_comp)
-		goto fail_scrub_wr_completion_workers;
+		return -ENOMEM;
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
-		ASSERT(fs_info->scrub_workers == NULL &&
-		       fs_info->scrub_wr_completion_workers == NULL);
+		ASSERT(fs_info->scrub_workers == NULL);
 		fs_info->scrub_workers = scrub_workers;
-		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
 		mutex_unlock(&fs_info->scrub_lock);
 		return 0;
@@ -2754,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 
 	ret = 0;
 
-	destroy_workqueue(scrub_wr_comp);
-fail_scrub_wr_completion_workers:
 	destroy_workqueue(scrub_workers);
-fail_scrub_workers:
 	return ret;
 }
 
-- 
cgit