From d5b81ced74afded85619ffbbe9c32ba9d82c9b1e Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 31 Aug 2022 13:55:48 +0900
Subject: btrfs: zoned: fix API misuse of zone finish waiting

The commit 2ce543f47843 ("btrfs: zoned: wait until zone is finished when
allocation didn't progress") implemented a zone finish waiting mechanism
to the write path of zoned mode. However, using
wait_var_event()/wake_up_all() on fs_info->zone_finish_wait is wrong and
wait_var_event() just hangs because no one ever wakes it up once it goes
into sleep.

Instead, we can simply use wait_on_bit_io() and clear_and_wake_up_bit()
on fs_info->flags with a proper barrier installed.

Fixes: 2ce543f47843 ("btrfs: zoned: wait until zone is finished when allocation didn't progress")
CC: stable@vger.kernel.org # 5.16+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   | 2 --
 fs/btrfs/disk-io.c | 1 -
 fs/btrfs/inode.c   | 7 +++----
 fs/btrfs/zoned.c   | 3 +--
 4 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ef162dbd4bc..df8c99c99df9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1088,8 +1088,6 @@ struct btrfs_fs_info {
 
 	spinlock_t zone_active_bgs_lock;
 	struct list_head zone_active_bgs;
-	/* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */
-	wait_queue_head_t zone_finish_wait;
 
 	/* Updates are not protected by any lock */
 	struct btrfs_commit_stats commit_stats;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0345aa16c01..129711773ae5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3070,7 +3070,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 	init_waitqueue_head(&fs_info->delayed_iputs_wait);
-	init_waitqueue_head(&fs_info->zone_finish_wait);
 
 	/* Usable values until the real ones are cached from the superblock */
 	fs_info->nodesize = 4096;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bea0adf28735..a95bac0c3e20 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
 			done_offset = end;
 
 		if (done_offset == start) {
-			struct btrfs_fs_info *info = inode->root->fs_info;
-
-			wait_var_event(&info->zone_finish_wait,
-				       !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
+			wait_on_bit_io(&inode->root->fs_info->flags,
+				       BTRFS_FS_NEED_ZONE_FINISH,
+				       TASK_UNINTERRUPTIBLE);
 			continue;
 		}
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b150b07ba1a7..fb25ba7fcb2a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2007,8 +2007,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 	/* For active_bg_list */
 	btrfs_put_block_group(block_group);
 
-	clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
-	wake_up_all(&fs_info->zone_finish_wait);
+	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
 
 	return 0;
 }
-- 
cgit 


From cac5c44c48c9fb9cc31bea15ebd9ef0c6462314f Mon Sep 17 00:00:00 2001
From: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Date: Fri, 26 Aug 2022 16:42:15 +0900
Subject: btrfs: zoned: set pseudo max append zone limit in zone emulation mode

The commit 7d7672bc5d10 ("btrfs: convert count_max_extents() to use
fs_info->max_extent_size") introduced a division by
fs_info->max_extent_size. This max_extent_size is initialized with max
zone append limit size of the device btrfs runs on. However, in zone
emulation mode, the device is not zoned then its zone append limit is
zero. This resulted in zero value of fs_info->max_extent_size and caused
zero division error.

Fix the error by setting non-zero pseudo value to max append zone limit
in zone emulation mode. Set the pseudo value based on max_segments as
suggested in the commit c2ae7b772ef4 ("btrfs: zoned: revive
max_zone_append_bytes").

Fixes: 7d7672bc5d10 ("btrfs: convert count_max_extents() to use fs_info->max_extent_size")
CC: stable@vger.kernel.org # 5.12+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index fb25ba7fcb2a..f576e30ceb33 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 	 * since btrfs adds the pages one by one to a bio, and btrfs cannot
 	 * increase the metadata reservation even if it increases the number of
 	 * extents, it is safe to stick with the limit.
+	 *
+	 * With the zoned emulation, we can have non-zoned device on the zoned
+	 * mode. In this case, we don't have a valid max zone append size. So,
+	 * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
 	 */
-	zone_info->max_zone_append_size =
-		min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
-		      (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+	if (bdev_is_zoned(bdev)) {
+		zone_info->max_zone_append_size = min_t(u64,
+			(u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+			(u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+	} else {
+		zone_info->max_zone_append_size =
+			(u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+	}
 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
 		zone_info->nr_zones++;
 
-- 
cgit 


From 6ca64ac2763149fb66c0b4bf12f5e0977a88e51d Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Mon, 5 Sep 2022 05:38:24 -0700
Subject: btrfs: zoned: fix mounting with conventional zones

Since commit 6a921de58992 ("btrfs: zoned: introduce
space_info->active_total_bytes"), we're only counting the bytes of a
block group on an active zone as usable for metadata writes. But on a
SMR drive, we don't have active zones and short circuit some of the
logic.

This leads to an error on mount, because we cannot reserve space for
metadata writes.

Fix this by also setting the BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE bit in the
block-group's runtime flag if the zone is a conventional zone.

Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 81 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f576e30ceb33..62e7007a7e46 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1187,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
  * offset.
  */
 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
-				   u64 *offset_ret)
+				   u64 *offset_ret, bool new)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_root *root;
@@ -1197,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 	int ret;
 	u64 length;
 
+	/*
+	 * Avoid  tree lookups for a new block group, there's no use for it.
+	 * It must always be 0.
+	 *
+	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+	 * For new a block group, this function is called from
+	 * btrfs_make_block_group() which is already taking the chunk mutex.
+	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
+	 * buffer locks to avoid deadlock.
+	 */
+	if (new) {
+		*offset_ret = 0;
+		return 0;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1332,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		else
 			num_conventional++;
 
+		/*
+		 * Consider a zone as active if we can allow any number of
+		 * active zones.
+		 */
+		if (!device->zone_info->max_active_zones)
+			__set_bit(i, active);
+
 		if (!is_sequential) {
 			alloc_offsets[i] = WP_CONVENTIONAL;
 			continue;
@@ -1398,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 			__set_bit(i, active);
 			break;
 		}
-
-		/*
-		 * Consider a zone as active if we can allow any number of
-		 * active zones.
-		 */
-		if (!device->zone_info->max_active_zones)
-			__set_bit(i, active);
 	}
 
 	if (num_sequential > 0)
 		cache->seq_zone = true;
 
 	if (num_conventional > 0) {
-		/*
-		 * Avoid calling calculate_alloc_pointer() for new BG. It
-		 * is no use for new BG. It must be always 0.
-		 *
-		 * Also, we have a lock chain of extent buffer lock ->
-		 * chunk mutex.  For new BG, this function is called from
-		 * btrfs_make_block_group() which is already taking the
-		 * chunk mutex. Thus, we cannot call
-		 * calculate_alloc_pointer() which takes extent buffer
-		 * locks to avoid deadlock.
-		 */
-
 		/* Zone capacity is always zone size in emulation */
 		cache->zone_capacity = cache->length;
-		if (new) {
-			cache->alloc_offset = 0;
-			goto out;
-		}
-		ret = calculate_alloc_pointer(cache, &last_alloc);
-		if (ret || map->num_stripes == num_conventional) {
-			if (!ret)
-				cache->alloc_offset = last_alloc;
-			else
-				btrfs_err(fs_info,
+		ret = calculate_alloc_pointer(cache, &last_alloc, new);
+		if (ret) {
+			btrfs_err(fs_info,
 			"zoned: failed to determine allocation offset of bg %llu",
-					  cache->start);
+				  cache->start);
+			goto out;
+		} else if (map->num_stripes == num_conventional) {
+			cache->alloc_offset = last_alloc;
+			cache->zone_is_active = 1;
 			goto out;
 		}
 	}
@@ -1504,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		goto out;
 	}
 
-	if (cache->zone_is_active) {
-		btrfs_get_block_group(cache);
-		spin_lock(&fs_info->zone_active_bgs_lock);
-		list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
-		spin_unlock(&fs_info->zone_active_bgs_lock);
-	}
-
 out:
 	if (cache->alloc_offset > fs_info->zone_size) {
 		btrfs_err(fs_info,
@@ -1535,10 +1528,16 @@ out:
 		ret = -EIO;
 	}
 
-	if (!ret)
+	if (!ret) {
 		cache->meta_write_pointer = cache->alloc_offset + cache->start;
-
-	if (ret) {
+		if (cache->zone_is_active) {
+			btrfs_get_block_group(cache);
+			spin_lock(&fs_info->zone_active_bgs_lock);
+			list_add_tail(&cache->active_bg_list,
+				      &fs_info->zone_active_bgs);
+			spin_unlock(&fs_info->zone_active_bgs_lock);
+		}
+	} else {
 		kfree(cache->physical_map);
 		cache->physical_map = NULL;
 	}
-- 
cgit 


From 5da431b71d4b9be3c8cf6786eff9e2d41a5f9f65 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 18 Aug 2022 15:06:44 +0800
Subject: btrfs: fix the max chunk size and stripe length calculation

[BEHAVIOR CHANGE]
Since commit f6fca3917b4d ("btrfs: store chunk size in space-info
struct"), btrfs no longer can create larger data chunks than 1G:

  mkfs.btrfs -f -m raid1 -d raid0 $dev1 $dev2 $dev3 $dev4
  mount $dev1 $mnt

  btrfs balance start --full $mnt
  btrfs balance start --full $mnt
  umount $mnt

  btrfs ins dump-tree -t chunk $dev1 | grep "DATA|RAID0" -C 2

Before that offending commit, what we got is a 4G data chunk:

	item 6 key (FIRST_CHUNK_TREE CHUNK_ITEM 9492758528) itemoff 15491 itemsize 176
		length 4294967296 owner 2 stripe_len 65536 type DATA|RAID0
		io_align 65536 io_width 65536 sector_size 4096
		num_stripes 4 sub_stripes 1

Now what we got is only 1G data chunk:

	item 6 key (FIRST_CHUNK_TREE CHUNK_ITEM 6271533056) itemoff 15491 itemsize 176
		length 1073741824 owner 2 stripe_len 65536 type DATA|RAID0
		io_align 65536 io_width 65536 sector_size 4096
		num_stripes 4 sub_stripes 1

This will increase the number of data chunks by the number of devices,
not only increase system chunk usage, but also greatly increase mount
time.

Without a proper reason, we should not change the max chunk size.

[CAUSE]
Previously, we set max data chunk size to 10G, while max data stripe
length to 1G.

Commit f6fca3917b4d ("btrfs: store chunk size in space-info struct")
completely ignored the 10G limit, but use 1G max stripe limit instead,
causing above shrink in max data chunk size.

[FIX]
Fix the max data chunk size to 10G, and in decide_stripe_size_regular()
we limit stripe_size to 1G manually.

This should only affect data chunks, as for metadata chunks we always
set the max stripe size the same as max chunk size (256M or 1G
depending on fs size).

Now the same script result the same old result:

	item 6 key (FIRST_CHUNK_TREE CHUNK_ITEM 9492758528) itemoff 15491 itemsize 176
		length 4294967296 owner 2 stripe_len 65536 type DATA|RAID0
		io_align 65536 io_width 65536 sector_size 4096
		num_stripes 4 sub_stripes 1

Reported-by: Wang Yugui <wangyugui@e16-tech.com>
Fixes: f6fca3917b4d ("btrfs: store chunk size in space-info struct")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 2 +-
 fs/btrfs/volumes.c    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d0cbeb7ae81c..435559ba94fa 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
 	ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
 
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		return SZ_1G;
+		return BTRFS_MAX_DATA_CHUNK_SIZE;
 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		return SZ_32M;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 064ab2a79c80..f63ff91e2883 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5267,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
 				       ctl->stripe_size);
 	}
 
+	/* Stripe size should not go beyond 1G. */
+	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
 	/* Align to BTRFS_STRIPE_LEN */
 	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
 	ctl->chunk_size = ctl->stripe_size * data_stripes;
-- 
cgit