summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/extent_io.c24
-rw-r--r--fs/btrfs/inode.c29
-rw-r--r--fs/btrfs/subpage.c19
-rw-r--r--fs/btrfs/super.c13
-rw-r--r--fs/btrfs/zoned.c133
5 files changed, 163 insertions, 55 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f23d75986947..c953297aa89a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1512,7 +1512,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1535,8 +1535,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ /*
+ * When submission failed, we should still clear the folio dirty.
+ * Or the folio will be written back again but without any
+ * ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
em_end = btrfs_extent_map_end(em);
@@ -1609,8 +1618,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
folio_unlock(folio);
return 1;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
return ret;
+ }
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@@ -1666,8 +1679,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
- * If we hit any error, the corresponding sector will still be dirty
- * thus no need to clear PAGECACHE_TAG_DIRTY.
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
@@ -1813,6 +1826,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
xas_load(&xas);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d740910e071a..9e4aec7330cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4189,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4289,7 +4306,7 @@ skip_backref:
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ update_time_after_link_or_unlink(dir);
return btrfs_update_inode(trans, dir);
}
@@ -6683,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c9b3821957f7..cb4f97833dc3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&bfs->lock, flags);
bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+ * assume writeback is complete, and exit too early — violating sync
+ * ordering guarantees.
+ */
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
+ __folio_start_writeback(folio, true);
+ if (!folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+ }
spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68e35a3700ff..a262b494a89f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -698,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -980,6 +980,8 @@ static int btrfs_fill_super(struct super_block *sb,
return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -1437,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1459,10 +1461,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index db11b5b5f0e6..ea662036f441 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -2168,10 +2175,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2230,7 +2242,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
struct extent_buffer *eb;
- unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2257,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2375,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (!device->bdev)
- continue;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2504,12 +2531,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+ struct btrfs_space_info *space_info = data_sinfo;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *bg;
struct list_head *bg_list;
u64 alloc_flags;
- bool initial = false;
+ bool first = true;
bool did_chunk_alloc = false;
int index;
int ret;
@@ -2523,21 +2550,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
if (sb_rdonly(fs_info->sb))
return;
- ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
index = btrfs_bg_flags_to_raid_index(alloc_flags);
- bg_list = &data_sinfo->block_groups[index];
+ /* Scan the data space_info to find empty block groups. Take the second one. */
again:
+ bg_list = &space_info->block_groups[index];
list_for_each_entry(bg, bg_list, list) {
- if (bg->used > 0)
+ if (bg->alloc_offset != 0)
continue;
- if (!initial) {
- initial = true;
+ if (first) {
+ first = false;
continue;
}
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* We can assume this as we choose the second empty one. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ /* There is no allocation ever happened. */
+ ASSERT(bg->used == 0);
+ ASSERT(bg->zone_unusable == 0);
+ /* No super block in a block group on the zoned setup. */
+ ASSERT(bg->bytes_super == 0);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
fs_info->data_reloc_bg = bg->start;
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
btrfs_zone_activate(bg);
@@ -2552,11 +2610,18 @@ again:
if (IS_ERR(trans))
return;
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
did_chunk_alloc = true;
- bg_list = &space_info->block_groups[index];
goto again;
}
}