From 4ce02c67972211be488408c275c8fbf19faf29b3 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 10 Jul 2023 14:12:43 -0700 Subject: iomap: Add per-block dirty state tracking to improve performance When filesystem blocksize is less than folio size (either with mapping_large_folio_support() or with blocksize < pagesize) and when the folio is uptodate in pagecache, then even a byte write can cause an entire folio to be written to disk during writeback. This happens because we currently don't have a mechanism to track per-block dirty state within struct iomap_folio_state. We currently only track uptodate state. This patch implements support for tracking per-block dirty state in iomap_folio_state->state bitmap. This should help improve the filesystem write performance and help reduce write amplification. Performance testing of below fio workload reveals ~16x performance improvement using nvme with XFS (4k blocksize) on Power (64K pagesize) FIO reported write bw scores improved from around ~28 MBps to ~452 MBps. 1. [global] ioengine=psync rw=randwrite overwrite=1 pre_read=1 direct=0 bs=4k size=1G dir=./ numjobs=8 fdatasync=1 runtime=60 iodepth=64 group_reporting=1 [fio-run] 2. Also our internal performance team reported that this patch improves their database workload performance by around ~83% (with XFS on Power) Reported-by: Aravinda Herle Reported-by: Brian Foster Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 161 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 150 insertions(+), 11 deletions(-) (limited to 'fs/iomap') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1ec4ef44851f..283fb96f6609 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -25,13 +25,19 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); /* - * Structure allocated for each folio to track per-block uptodate state + * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. */ struct iomap_folio_state { atomic_t read_bytes_pending; atomic_t write_bytes_pending; spinlock_t state_lock; + + /* + * Each block has two bits in this bitmap: + * Bits [0..blocks_per_folio) has the uptodate status. + * Bits [b_p_f...(2*b_p_f)) has the dirty status. + */ unsigned long state[]; }; @@ -78,6 +84,61 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } +static inline bool ifs_block_is_dirty(struct folio *folio, + struct iomap_folio_state *ifs, int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + blks_per_folio, ifs->state); +} + +static void ifs_clear_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_clear_range_dirty(folio, ifs, off, len); +} + +static void ifs_set_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty(folio, ifs, off, len); +} + static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { @@ -93,14 +154,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode, else gfp = GFP_NOFS | __GFP_NOFAIL; - ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)), - gfp); - if (ifs) { - spin_lock_init(&ifs->state_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(ifs->state, nr_blocks); - folio_attach_private(folio, ifs); - } + /* + * ifs->state tracks two sets of state flags when the + * filesystem block size is smaller than the folio size. + * The first state tracks per-block uptodate and the + * second tracks per-block dirty state. + */ + ifs = kzalloc(struct_size(ifs, state, + BITS_TO_LONGS(2 * nr_blocks)), gfp); + if (!ifs) + return ifs; + + spin_lock_init(&ifs->state_lock); + if (folio_test_uptodate(folio)) + bitmap_set(ifs->state, 0, nr_blocks); + if (folio_test_dirty(folio)) + bitmap_set(ifs->state, nr_blocks, nr_blocks); + folio_attach_private(folio, ifs); + return ifs; } @@ -519,6 +590,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + size_t len = folio_size(folio); + + ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, len); + return filemap_dirty_folio(mapping, folio); +} +EXPORT_SYMBOL_GPL(iomap_dirty_folio); + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -723,6 +805,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); + iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); return copied; } @@ -892,6 +975,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static int iomap_write_delalloc_ifs_punch(struct inode *inode, + struct folio *folio, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + unsigned int first_blk, last_blk, i; + loff_t last_byte; + u8 blkbits = inode->i_blkbits; + struct iomap_folio_state *ifs; + int ret = 0; + + /* + * When we have per-block dirty tracking, there can be + * blocks within a folio which are marked uptodate + * but not dirty. In that case it is necessary to punch + * out such blocks to avoid leaking any delalloc blocks. + */ + ifs = folio->private; + if (!ifs) + return ret; + + last_byte = min_t(loff_t, end_byte - 1, + folio_pos(folio) + folio_size(folio) - 1); + first_blk = offset_in_folio(folio, start_byte) >> blkbits; + last_blk = offset_in_folio(folio, last_byte) >> blkbits; + for (i = first_blk; i <= last_blk; i++) { + if (!ifs_block_is_dirty(folio, ifs, i)) { + ret = punch(inode, folio_pos(folio) + (i << blkbits), + 1 << blkbits); + if (ret) + return ret; + } + } + + return ret; +} + + static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, iomap_punch_t punch) @@ -909,6 +1029,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, return ret; } + /* Punch non-dirty blocks within folio */ + ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, + end_byte, punch); + if (ret) + return ret; + /* * Make sure the next punch start is correctly bound to * the end of this data range, not the end of the folio. @@ -1639,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0); + struct iomap_folio_state *ifs = folio->private; struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1647,6 +1773,13 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); + WARN_ON_ONCE(end_pos <= pos); + + if (!ifs && nblocks > 1) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); /* @@ -1655,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * invalid, grab a new one. */ for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_uptodate(ifs, i)) + if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; error = wpc->ops->map_blocks(wpc, inode, pos); @@ -1699,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, } } + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); folio_start_writeback(folio); folio_unlock(folio); -- cgit