Diffstat (limited to 'fs/btrfs/zoned.c')
 fs/btrfs/zoned.c | 729
 1 file changed, 536 insertions(+), 193 deletions(-)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 947a87576f6c..359a98e6de85 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
-#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
@@ -17,6 +16,8 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -36,12 +37,15 @@
#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
-#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
-#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+#define BTRFS_SB_LOG_FIRST_SHIFT ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum number of active zones we need:
*
@@ -87,10 +91,10 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
- ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL,
+ "zones[%d].type=%d", i, zones[i].type);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
}
@@ -121,9 +125,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
BTRFS_SUPER_INFO_SIZE;
@@ -144,7 +147,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
else
sector = zones[0].start;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
btrfs_release_disk_super(super[i]);
} else if (!full[0] && (empty[1] || full[1])) {
sector = zones[0].wp;
@@ -164,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror)
{
u64 zone = U64_MAX;
- ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror);
switch (mirror) {
case 0: zone = 0; break;
case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
}
- ASSERT(zone <= U32_MAX);
+ ASSERT(zone <= U32_MAX, "zone=%llu", zone);
return (u32)zone;
}
@@ -238,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
unsigned int i;
u32 zno;
- ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size),
+ "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size);
zno = pos >> zinfo->zone_size_shift;
/*
* We cannot report zones beyond the zone end. So, it is OK to
@@ -262,17 +266,17 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
}
}
- ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
- copy_zone_info_cb, zones);
+ ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+ *nr_zones, copy_zone_info_cb, zones);
if (ret < 0) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
- pos, rcu_str_deref(device->name),
+ pos, rcu_dereference(device->name),
device->devid);
return ret;
}
*nr_zones = ret;
- if (!ret)
+ if (unlikely(!ret))
return -EIO;
/* Populate cache */
@@ -289,7 +293,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
/* The emulated zone size is determined from the size of a device extent. */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -306,28 +310,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (unlikely(ret > 0))
+ return -EUCLEAN;
}
leaf = path->nodes[0];
dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
- ret = 0;
-
-out:
- btrfs_free_path(path);
-
- return ret;
+ return 0;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -404,16 +401,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
/* We reject devices with a zone size larger than 8GB */
if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu larger than supported maximum %llu",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
@@ -425,11 +422,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
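+ /*
+ * Use the stricter of the max-active and max-open limits, treating 0 as
+ * "no limit". If the device reports no limit at all, fall back to a
+ * software default cap so that active zone tracking still bounds the
+ * number of concurrently written zones.
+ */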
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_str_deref(device->name), max_active_zones,
+ rcu_dereference(device->name), max_active_zones,
BTRFS_MIN_ACTIVE_ZONES);
ret = -EINVAL;
goto out;
@@ -469,9 +469,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_cache = vcalloc(zone_info->nr_zones,
sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to allocate zone cache for %s",
- rcu_str_deref(device->name));
+ rcu_dereference(device->name));
ret = -ENOMEM;
goto out;
}
@@ -496,6 +496,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
__set_bit(nreported, zone_info->active_zones);
nactive++;
break;
@@ -505,20 +506,25 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
}
- if (nreported != zone_info->nr_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nreported != zone_info->nr_zones)) {
+ btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
- rcu_str_deref(device->name), nreported,
+ rcu_dereference(device->name), nreported,
zone_info->nr_zones);
ret = -EIO;
goto out;
}
if (max_active_zones) {
- if (nactive > max_active_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nactive > max_active_zones)) {
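+ /*
+ * If the limit is only the software default and not a device limit,
+ * the device evidently supports more active zones, so drop the
+ * artificial limit instead of failing the mount.
+ */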
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
+ btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_str_deref(device->name),
+ nactive, rcu_dereference(device->name),
max_active_zones);
ret = -EIO;
goto out;
@@ -528,6 +534,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -546,8 +553,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (ret)
goto out;
- if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
+ btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -564,8 +571,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
- if (ret != -ENOENT && ret) {
- btrfs_err_in_rcu(device->fs_info,
+ if (unlikely(ret != -ENOENT && ret)) {
+ btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -584,9 +591,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
emulated = "emulated ";
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"%s block device %s, %u %szones of %llu bytes",
- model, rcu_str_deref(device->name), zone_info->nr_zones,
+ model, rcu_dereference(device->name), zone_info->nr_zones,
emulated, zone_info->zone_size);
return 0;
@@ -652,8 +659,7 @@ out:
return NULL;
}
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
+static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
unsigned int nr_zones = 1;
int ret;
@@ -717,11 +723,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* zoned mode. In this case, we don't have a valid max zone
* append size.
*/
- if (bdev_is_zoned(device->bdev)) {
- blk_stack_limits(lim,
- &bdev_get_queue(device->bdev)->limits,
- 0);
- }
+ if (bdev_is_zoned(device->bdev))
+ blk_stack_limits(lim, bdev_limits(device->bdev), 0);
+ }
+
+ ret = blk_validate_limits(lim);
+ if (ret) {
+ btrfs_err(fs_info, "zoned: failed to validate queue limits");
+ return ret;
}
/*
@@ -755,8 +764,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
- if (fs_info->max_zone_append_size < fs_info->max_extent_size)
- fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
+ fs_info->max_zone_append_size);
/*
* Check mount options here, because we might change fs_info->zoned
@@ -770,7 +780,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -888,12 +899,12 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
- BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
- zones);
+ ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES,
+ copy_zone_info_cb, zones);
if (ret < 0)
return ret;
- if (ret != BTRFS_NR_SB_LOG_ZONES)
+ if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
return -EIO;
return sb_log_location(bdev, zones, rw, bytenr_ret);
@@ -994,7 +1005,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
}
/* All the zones are FULL. Should not reach here. */
- ASSERT(0);
+ DEBUG_WARN("unexpected state, all zones full");
return -EIO;
}
@@ -1047,8 +1058,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
bool have_sb;
int i;
- ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size),
+ "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size),
+ "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size);
while (pos < hole_end) {
begin = pos >> shift;
@@ -1164,8 +1177,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
u64 pos;
int ret;
- ASSERT(IS_ALIGNED(start, zinfo->zone_size));
- ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size),
+ "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size);
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size),
+ "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size);
if (begin + nbits > zinfo->nr_zones)
return -ERANGE;
@@ -1187,10 +1202,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
continue;
/* Free regions should be empty */
- btrfs_warn_in_rcu(
+ btrfs_warn(
device->fs_info,
"zoned: resetting device %s (devid %llu) zone %llu for allocation",
- rcu_str_deref(device->name), device->devid, pos >> shift);
+ rcu_dereference(device->name), device->devid, pos >> shift);
WARN_ON_ONCE(1);
ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
@@ -1213,7 +1228,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
@@ -1245,10 +1260,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
- if (!ret)
+ if (unlikely(!ret))
ret = -EUCLEAN;
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_previous_extent_item(root, path, cache->start);
if (ret) {
@@ -1256,7 +1271,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
ret = 0;
*offset_ret = 0;
}
- goto out;
+ return ret;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
@@ -1266,17 +1281,12 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
else
length = fs_info->nodesize;
- if (!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length)) {
- ret = -EUCLEAN;
- goto out;
+ if (unlikely(!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length))) {
+ return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
- ret = 0;
-
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
struct zone_info {
@@ -1287,7 +1297,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1314,9 +1324,12 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
+ info->capacity = device->zone_info->zone_size;
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1329,6 +1342,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
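+ /*
+ * A newly created block group sits on known-empty zones (asserted
+ * above), so there is no need to report the zone: the write pointer is
+ * at the zone start and only the zone capacity has to be queried.
+ */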
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1340,10 +1365,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
+ if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
+ btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
device->devid);
up_read(&dev_replace->rwsem);
return -EIO;
@@ -1357,7 +1382,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
(info->physical >> device->zone_info->zone_size_shift),
- rcu_str_deref(device->name), device->devid);
+ rcu_dereference(device->name), device->devid);
info->alloc_offset = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
@@ -1382,7 +1407,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
struct zone_info *info,
unsigned long *active)
{
- if (info->alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
info->physical);
@@ -1399,7 +1424,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1408,40 +1434,49 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
return -EINVAL;
}
- if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
+ if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
- if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
return -EIO;
}
- if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+
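+ /*
+ * A conventional zone has no device write pointer, so emulate it with
+ * the allocation offset calculated from the extent tree.
+ */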
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
+ if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else if (test_bit(0, active)) {
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
bg->alloc_offset = zone_info[0].alloc_offset;
- bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
return 0;
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1452,30 +1487,32 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
return -EINVAL;
}
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
- if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
+ if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED))) {
btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in %s profile",
btrfs_bg_type_to_raid_name(map->type));
return -EIO;
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_test_opt(fs_info, DEGRADED) &&
- !btrfs_zone_activate(bg)) {
+ if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg))) {
return -EIO;
}
} else {
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
- /* In case a device is missing we have a cap of 0, so don't use it. */
- bg->zone_capacity = min_not_zero(zone_info[0].capacity,
- zone_info[1].capacity);
}
if (zone_info[0].alloc_offset != WP_MISSING_DEV)
@@ -1489,9 +1526,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1499,13 +1539,30 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
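+ /*
+ * Worked example with num_stripes = 2 and BTRFS_STRIPE_LEN = 64K: for
+ * last_alloc = 160K this gives stripe_nr = 2 and stripe_offset = 32K,
+ * and div_u64_rem() turns that into stripe_nr = 1, stripe_index = 0.
+ * In the loop below, device 0 (the one being written) gets
+ * 64K + 32K = 96K, device 1 gets 64K, and the per-device offsets sum
+ * back to the 160K logical allocation.
+ */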
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
@@ -1521,9 +1578,12 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1531,19 +1591,35 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
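+ /*
+ * Same write pointer emulation as for RAID0, except that a stripe
+ * covers a group of sub_stripes mirrored devices: the comparison below
+ * uses i / map->sub_stripes and every mirror in a group gets the same
+ * emulated offset.
+ */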
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset += stripe_offset;
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
@@ -1559,18 +1635,19 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
- struct zone_info *zone_info = NULL;
+ struct zone_info AUTO_KFREE(zone_info);
int ret;
int i;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
+ u64 profile;
if (!btrfs_is_zoned(fs_info))
return 0;
/* Sanity check */
- if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
btrfs_err(fs_info,
"zoned: block group %llu len %llu unaligned to zone size %llu",
logical, length, fs_info->zone_size);
@@ -1596,7 +1673,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1610,8 +1687,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
- /* Zone capacity is always zone size in emulation */
- cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@@ -1620,28 +1695,34 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
+ cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
}
- switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ switch (profile) {
case 0: /* single */
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
@@ -1652,6 +1733,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
+ profile != BTRFS_BLOCK_GROUP_RAID10) {
+ /*
+ * Detected a broken write pointer. Make this block group
+ * unallocatable by setting the allocation pointer to the end of the
+ * allocatable region. Relocating this block group will fix the
+ * mismatch.
+ *
+ * Currently, we cannot handle the RAID0 or RAID10 cases like this
+ * because we don't have a proper zone_capacity value. But reading
+ * from such a block group won't work anyway due to the missing
+ * stripe.
+ */
+ cache->alloc_offset = cache->zone_capacity;
+ }
+
out:
/* Reject non-SINGLE data profiles without RST */
if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
@@ -1659,10 +1756,10 @@ out:
!fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
btrfs_bg_type_to_raid_name(map->type));
- return -EINVAL;
+ ret = -EINVAL;
}
- if (cache->alloc_offset > cache->zone_capacity) {
+ if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
cache->alloc_offset, cache->zone_capacity,
@@ -1692,7 +1789,6 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(zone_info);
return ret;
}
@@ -1719,21 +1815,21 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
bool ret = false;
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(&inode->vfs_inode))
+ if (!is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
return false;
/*
- * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has.
* Furthermore we have set aside our own block group from which only the
* relocation "process" can allocate and make sure only one process at a
@@ -1768,16 +1864,18 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
u64 logical)
{
- struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
+ struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
struct extent_map *em;
ordered->disk_bytenr = logical;
write_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, ordered->file_offset,
- ordered->num_bytes);
- em->block_start = logical;
- free_extent_map(em);
+ em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
+ /* The em should be a new COW extent, thus it should not have an offset. */
+ ASSERT(em->offset == 0, "em->offset=%llu", em->offset);
+ em->disk_bytenr = logical;
+ btrfs_free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1787,8 +1885,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
- ordered->num_bytes, len, logical))
+ btrfs_split_extent_map(ordered->inode, ordered->file_offset,
+ ordered->num_bytes, len, logical))
return false;
new = btrfs_split_ordered_extent(ordered, len);
@@ -1801,7 +1899,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sum;
u64 logical, len;
@@ -1845,7 +1943,7 @@ out:
* here so that we don't attempt to log the csums later.
*/
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
while ((sum = list_first_entry_or_null(&ordered->list,
typeof(*sum), list))) {
list_del(&sum->list);
@@ -1965,7 +2063,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group->meta_write_pointer > eb->start)
return -EBUSY;
- /* If for_sync, this hole will be filled with trasnsaction commit. */
+ /* If for_sync, this hole will be filled with transaction commit. */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
return -EAGAIN;
return -EBUSY;
@@ -1991,7 +2089,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc, NULL, NULL);
- if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
ret = -EIO;
goto out_put_bioc;
}
@@ -2049,7 +2147,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
if (physical_pos == wp)
return 0;
- if (physical_pos > wp)
+ if (unlikely(physical_pos > wp))
return -EUCLEAN;
length = wp - physical_pos;
@@ -2085,10 +2183,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2099,6 +2202,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2143,27 +2249,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
- struct radix_tree_iter iter;
struct extent_buffer *eb;
- void __rcu **slot;
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
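+ /* The buffer_tree xarray is indexed by eb->start >> nodesize_bits. */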
rcu_read_lock();
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
- block_group->start >> fs_info->sectorsize_bits) {
- eb = radix_tree_deref_slot(slot);
- if (!eb)
- continue;
- if (radix_tree_deref_retry(eb)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
+ xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
if (eb->start < block_group->start)
continue;
if (eb->start >= end)
break;
-
- slot = radix_tree_iter_resume(slot, &iter);
rcu_read_unlock();
wait_on_extent_buffer_writeback(eb);
rcu_read_lock();
@@ -2171,6 +2265,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
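+
+/*
+ * Finish the zone backing one stripe of a block group: issue
+ * REQ_OP_ZONE_FINISH for a sequential zone and clear the device's
+ * active-zone tracking. For metadata block groups, the freed slot is
+ * returned to the reserved active zones.
+ */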
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2215,8 +2343,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
/* Wait for extent buffers to be written. */
if (is_metadata)
wait_eb_writebacks(block_group);
@@ -2256,28 +2383,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2314,6 +2425,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!btrfs_is_zoned(fs_info))
return true;
+ if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
+ return false;
+
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&fs_info->zone_active_bgs_lock);
@@ -2352,16 +2466,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
return ret;
}
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
- return;
+ return 0;
block_group = btrfs_lookup_block_group(fs_info, logical);
- ASSERT(block_group);
+ if (WARN_ON_ONCE(!block_group))
+ return -ENOENT;
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2378,16 +2493,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
out:
btrfs_put_block_group(block_group);
+ return 0;
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
+ int ret;
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
- btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ ret = do_zone_finish(bg, true);
+ if (ret)
+ btrfs_handle_fs_error(bg->fs_info, ret,
+ "Failed to finish block-group's zone");
btrfs_put_block_group(bg);
}
@@ -2406,10 +2526,10 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
/* For the work */
btrfs_get_block_group(bg);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2422,6 +2542,106 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->relocation_bg_lock);
}
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_space_info *space_info = data_sinfo;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *bg;
+ struct list_head *bg_list;
+ u64 alloc_flags;
+ bool first = true;
+ bool did_chunk_alloc = false;
+ int index;
+ int ret;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ if (sb_rdonly(fs_info->sb))
+ return;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ index = btrfs_bg_flags_to_raid_index(alloc_flags);
+
+ /*
+ * Scan the data space_info to find empty block groups. Take the
+ * second one found, leaving the first for regular data allocations.
+ */
+again:
+ bg_list = &space_info->block_groups[index];
+ list_for_each_entry(bg, bg_list, list) {
+ if (bg->alloc_offset != 0)
+ continue;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* The list cannot be empty here, as we chose the second empty block group. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
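+ /*
+ * Undo this block group's contribution to the plain data space_info
+ * counters; btrfs_add_bg_to_space_info() below re-adds it to the
+ * relocation sub-group.
+ */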
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
+ /* No allocation has ever happened in this block group. */
+ ASSERT(bg->used == 0, "bg->used=%llu", bg->used);
+ /* There is no super block in a block group on a zoned setup. */
+ ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
+ fs_info->data_reloc_bg = bg->start;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
+ btrfs_zone_activate(bg);
+
+ return;
+ }
+
+ if (did_chunk_alloc)
+ return;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC,
+ "space_info->subgroup_id=%d", space_info->subgroup_id);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
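+ /* btrfs_chunk_alloc() returns 1 when it allocated a new chunk. */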
+ if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
+ did_chunk_alloc = true;
+ goto again;
+ }
+}
+
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2440,12 +2660,12 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
}
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2458,7 +2678,6 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2512,7 +2731,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
@@ -2539,10 +2758,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
return ret < 0 ? ret : 1;
}
-int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool do_finish)
+int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish)
{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_group *bg;
int index;
@@ -2641,3 +2859,128 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
+
+/*
+ * Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
+ *
+ * @space_info: the space to work on
+ * @num_bytes: targeting reclaim bytes
+ *
+ * This one resets the zones of a block group, so we can reuse the region
+ * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
+ * just removes a block group and frees up the underlying zones. So, we still
+ * need to allocate a new block group to reuse the zones.
+ *
+ * Resetting is faster than deleting/recreating a block group. It is similar
+ * to freeing the logical space on the regular mode. However, we cannot change
+ * the block group's profile with this operation.
+ */
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ while (num_bytes > 0) {
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg = NULL;
+ bool found = false;
+ u64 reclaimed = 0;
+
+ /*
+ * Here, we choose a fully zone_unusable block group. It's
+ * technically possible to reset a partly zone_unusable block
+ * group, which still has some free space left. However,
+ * handling that needs to cope with the allocation side, which
+ * makes the logic more complex. So, let's handle the easy case
+ * for now.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
+ if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
+ continue;
+
+ /*
* Use trylock to avoid a lock ordering violation. In
+ * btrfs_reclaim_bgs_work(), the lock order is
+ * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
+ * block group if we cannot take its lock.
+ */
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+ found = true;
+ break;
+ }
+ if (!found) {
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return 0;
+ }
+
+ list_del_init(&bg->bg_list);
+ btrfs_put_block_group(bg);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * Since the block group is fully zone_unusable and we cannot
+ * allocate from this block group anymore, we don't need to set
+ * this block group read-only.
+ */
+
+ down_read(&fs_info->dev_replace.rwsem);
+ map = bg->physical_map;
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ unsigned int nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
+ stripe->physical >> SECTOR_SHIFT,
+ zone_size_sectors);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret) {
+ up_read(&fs_info->dev_replace.rwsem);
+ return ret;
+ }
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ ASSERT(!btrfs_is_block_group_used(bg));
+ if (bg->ro) {
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ continue;
+ }
+
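+ /*
+ * After the reset, the whole previously written region (alloc_offset
+ * bytes) becomes writable free space again; only the capacity-to-length
+ * gap remains zone_unusable.
+ */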
+ reclaimed = bg->alloc_offset;
+ bg->zone_unusable = bg->length - bg->zone_capacity;
+ bg->alloc_offset = 0;
+ /*
+ * This holds because we currently only reset block groups that
+ * were fully used and then freed.
+ */
+ ASSERT(reclaimed == bg->zone_capacity,
+ "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity);
+ bg->free_space_ctl->free_space += reclaimed;
+ space_info->bytes_zone_unusable -= reclaimed;
+ spin_unlock(&bg->lock);
+ btrfs_return_free_space(space_info, reclaimed);
+ spin_unlock(&space_info->lock);
+
+ if (num_bytes <= reclaimed)
+ break;
+ num_bytes -= reclaimed;
+ }
+
+ return 0;
+}