summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2025-10-29 15:54:36 +0100
committerChristian Brauner <brauner@kernel.org>2025-10-29 15:54:36 +0100
commit891bea757c771dca47e871444faecb1ed642a311 (patch)
treeeb814e06688a4d12c06424a60d497738b5a47c73
parent211c43d0938dd57c2937366339f8b651e7990d1a (diff)
parent015a54407782309c11655e1938bcbcef131ebf87 (diff)
Merge patch series "allow file systems to increase the minimum writeback chunk size v2"
Christoph Hellwig <hch@lst.de> says: The relatively low minimal writeback size of 4MiB leads means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size, and set it to the zone size for zoned XFS. * patches from https://patch.msgid.link/20251017034611.651385-1-hch@lst.de: xfs: set s_min_writeback_pages for zoned file systems writeback: allow the file system to override MIN_WRITEBACK_PAGES writeback: cleanup writeback_chunk_size Link: https://patch.msgid.link/20251017034611.651385-1-hch@lst.de Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r--fs/fs-writeback.c26
-rw-r--r--fs/super.c1
-rw-r--r--fs/xfs/xfs_zone_alloc.c28
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/writeback.h5
5 files changed, 42 insertions, 19 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4448de35ec8b..52763fa499d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -33,11 +33,6 @@
#include "internal.h"
/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
-
-/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
@@ -1889,8 +1884,8 @@ out:
return ret;
}
-static long writeback_chunk_size(struct bdi_writeback *wb,
- struct wb_writeback_work *work)
+static long writeback_chunk_size(struct super_block *sb,
+ struct bdi_writeback *wb, struct wb_writeback_work *work)
{
long pages;
@@ -1908,16 +1903,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* (maybe slowly) sync all tagged pages
*/
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
- pages = LONG_MAX;
- else {
- pages = min(wb->avg_write_bandwidth / 2,
- global_wb_domain.dirty_limit / DIRTY_SCOPE);
- pages = min(pages, work->nr_pages);
- pages = round_down(pages + MIN_WRITEBACK_PAGES,
- MIN_WRITEBACK_PAGES);
- }
+ return LONG_MAX;
- return pages;
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
+ pages = min(pages, work->nr_pages);
+ return round_down(pages + sb->s_min_writeback_pages,
+ sb->s_min_writeback_pages);
}
/*
@@ -2019,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb,
inode->i_state |= I_SYNC;
wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb, work);
+ write_chunk = writeback_chunk_size(inode->i_sb, wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..599c1d2641fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
+ s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
return s;
fail:
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..c342595acc3e 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1215,6 +1215,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1245,10 +1246,33 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that both to reduce seeks, but also more
+ * importantly so that workloads that writes files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase is to the zone size capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also get applied to non-zoned files on the data device if
+ * there are any. On typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a5dbfa20f8d7..6bf369095d2e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1583,6 +1583,7 @@ struct super_block {
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
+ long s_min_writeback_pages;
} __randomize_layout;
static inline struct user_namespace *i_user_ns(const struct inode *inode)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 22dd4adc5667..49e1dd96f43e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *);
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
+
#endif /* WRITEBACK_H */