From e2aaee9cd34d8396a48abf0b1be81a464c1d51c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jun 2020 14:47:20 -0700 Subject: xfs: move helpers that lock and unlock two inodes against userspace IO Move the double-inode locking helpers to xfs_inode.c since they're not specific to reflink. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9aea7d68d8ab..24edec472a7c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3881,3 +3881,96 @@ xfs_log_force_inode( return 0; return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); } + +/* + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + + if (src > dest) + swap(src, dest); + +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { + error = break_layout(dest, true); + if (error) + return error; + } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + return 0; +} + +/* + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or + * mmap activity. + */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} -- cgit From 96355d5a1f0ee6dcc182c37db4894ec0c29f1692 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:45 -0700 Subject: xfs: Don't allow logging of XFS_ISTALE inodes In tracking down a problem in this patchset, I discovered we are reclaiming dirty stale inodes. This wasn't discovered until inodes were always attached to the cluster buffer and then the rcu callback that freed inodes was assert failing because the inode still had an active pointer to the cluster buffer after it had been reclaimed. Debugging the issue indicated that this was a pre-existing issue resulting from the way the inodes are handled in xfs_inactive_ifree. When we free a cluster buffer from xfs_ifree_cluster, all the inodes in cache are marked XFS_ISTALE. Those that are clean have nothing else done to them and so eventually get cleaned up by background reclaim. i.e. it is assumed we'll never dirty/relog an inode marked XFS_ISTALE. On journal commit dirty stale inodes as are handled by both buffer and inode log items to run though xfs_istale_done() and removed from the AIL (buffer log item commit) or the log item will simply unpin it because the buffer log item will clean it. What happens to any specific inode is entirely dependent on which log item wins the commit race, but the result is the same - stale inodes are clean, not attached to the cluster buffer, and not in the AIL. Hence inode reclaim can just free these inodes without further care. However, if the stale inode is relogged, it gets dirtied again and relogged into the CIL. Most of the time this isn't an issue, because relogging simply changes the inode's location in the current checkpoint. Problems arise, however, when the CIL checkpoints between two transactions in the xfs_inactive_ifree() deferops processing. This results in the XFS_ISTALE inode being redirtied and inserted into the CIL without any of the other stale cluster buffer infrastructure being in place. Hence on journal commit, it simply gets unpinned, so it remains dirty in memory. Everything in inode writeback avoids XFS_ISTALE inodes so it can't be written back, and it is not tracked in the AIL so there's not even a trigger to attempt to clean the inode. Hence the inode just sits dirty in memory until inode reclaim comes along, sees that it is XFS_ISTALE, and goes to reclaim it. This reclaiming of a dirty inode caused use after free, list corruptions and other nasty issues later in this patchset. Hence this patch addresses a violation of the "never log XFS_ISTALE inodes" caused by the deferops processing rolling a transaction and relogging a stale inode in xfs_inactive_free. It also adds a bunch of asserts to catch this problem in debug kernels so that we don't reintroduce this problem in future. Reproducer for this issue was generic/558 on a v4 filesystem. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 24edec472a7c..917998801a99 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1740,10 +1740,31 @@ xfs_inactive_ifree( return error; } + /* + * We do not hold the inode locked across the entire rolling transaction + * here. We only need to hold it for the first transaction that + * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the + * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode + * here breaks the relationship between cluster buffer invalidation and + * stale inode invalidation on cluster buffer item journal commit + * completion, and can result in leaving dirty stale inodes hanging + * around in memory. + * + * We have no need for serialising this inode operation against other + * operations - we freed the inode and hence reallocation is required + * and that will serialise on reallocating the space the deferops need + * to free. Hence we can unlock the inode on the first commit of + * the transaction rather than roll it right through the deferops. This + * avoids relogging the XFS_ISTALE inode. + * + * We check that xfs_ifree() hasn't grown an internal transaction roll + * by asserting that the inode is still locked when it returns. + */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1756,7 +1777,6 @@ xfs_inactive_ifree( xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1774,7 +1794,6 @@ xfs_inactive_ifree( xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } -- cgit From 1dfde687a65fec73e6914c184ecf8e9e54ccfe74 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:45 -0700 Subject: xfs: remove logged flag from inode log item This was used to track if the item had logged fields being flushed to disk. We log everything in the inode these days, so this logic is no longer needed. Remove it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 917998801a99..2f65fe70d305 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2679,7 +2679,6 @@ xfs_ifree_cluster( list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (struct xfs_inode_log_item *)lip; - ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, @@ -2708,7 +2707,6 @@ xfs_ifree_cluster( iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3840,19 +3838,16 @@ xfs_iflush_int( * * We can play with the ili_fields bits here, because the inode lock * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. + * lock protects the ili_last_fields bits. Store the current LSN of the + * inode so that we can tell whether the item has moved in the AIL from + * xfs_iflush_done(). In order to read the lsn we need the AIL lock, + * because it is a 64 bit value that cannot be read atomically. */ error = 0; flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); -- cgit From 1319ebefd6ed7a9988b7b4bc9317fbcf61a28bfc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:46 -0700 Subject: xfs: add an inode item lock The inode log item is kind of special in that it can be aggregating new changes in memory at the same time time existing changes are being written back to disk. This means there are fields in the log item that are accessed concurrently from contexts that don't share any locking at all. e.g. updating ili_last_fields occurs at flush time under the ILOCK_EXCL and flush lock at flush time, under the flush lock at IO completion time, and is read under the ILOCK_EXCL when the inode is logged. Hence there is no actual serialisation between reading the field during logging of the inode in transactions vs clearing the field in IO completion. We currently get away with this by the fact that we are only clearing fields in IO completion, and nothing bad happens if we accidentally log more of the inode than we actually modify. Worst case is we consume a tiny bit more memory and log bandwidth. However, if we want to do more complex state manipulations on the log item that requires updates at all three of these potential locations, we need to have some mechanism of serialising those operations. To do this, introduce a spinlock into the log item to serialise internal state. This could be done via the xfs_inode i_flags_lock, but this then leads to potential lock inversion issues where inode flag updates need to occur inside locks that best nest inside the inode log item locks (e.g. marking inodes stale during inode cluster freeing). Using a separate spinlock avoids these sorts of problems and simplifies future code. This does not touch the use of ili_fields in the item formatting code - that is entirely protected by the ILOCK_EXCL at this point in time, so it remains untouched. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2f65fe70d305..d6da08165a2e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2704,9 +2704,11 @@ xfs_ifree_cluster( continue; iip = ip->i_itemp; + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2742,6 +2744,7 @@ xfs_ifree( { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2779,7 +2782,9 @@ xfs_ifree( ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -3835,20 +3840,19 @@ xfs_iflush_int( * know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. - * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Store the current LSN of the - * inode so that we can tell whether the item has moved in the AIL from - * xfs_iflush_done(). In order to read the lsn we need the AIL lock, - * because it is a 64 bit value that cannot be read atomically. */ error = 0; flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_iflush_done(). + */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); -- cgit From f593bf144c7dfee9715aa787ebbbe5dd8882e8e9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:46 -0700 Subject: xfs: mark inode buffers in cache Inode buffers always have write IO callbacks, so by marking them directly we can avoid needing to attach ->b_iodone functions to them. This avoids an indirect call, and makes future modifications much simpler. While this is largely a refactor of existing functionality, we broaden the scope of the flag to beyond where inodes are explicitly attached because future changes need to know what type of log items are attached to the buffer. Adding this buffer flag may invoke the inode iodone callback in cases where it wouldn't have been previously, but this is not a functional change because the callback is identical to the normal buffer write iodone callback when inodes are not attached. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6da08165a2e..4621d67f3428 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3862,13 +3862,13 @@ flush_out: * completion on the buffer to remove the inode from the AIL and release * the flush lock. */ + bp->b_flags |= _XBF_INODES; xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); ASSERT(!list_empty(&bp->b_li_list)); - ASSERT(bp->b_iodone != NULL); return error; } -- cgit From aac855ab1a98d9c20762047f26af47d391c3ba7a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:48:48 -0700 Subject: xfs: make inode IO completion buffer centric Having different io completion callbacks for different inode states makes things complex. We can detect if the inode is stale via the XFS_ISTALE flag in IO completion, so we don't need a special callback just for this. This means inodes only have a single iodone callback, and inode IO completion is entirely buffer centric at this point. Hence we no longer need to use a log item callback at all as we can just call xfs_iflush_done() directly from the buffer completions and walk the buffer log item list to complete the all inodes under IO. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4621d67f3428..721b8420be04 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2679,7 +2679,6 @@ xfs_ifree_cluster( list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { iip = (struct xfs_inode_log_item *)lip; - lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2712,8 +2711,7 @@ xfs_ifree_cluster( xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); + xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -3863,7 +3861,7 @@ flush_out: * the flush lock. */ bp->b_flags |= _XBF_INODES; - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); -- cgit From 2ef3f7f5db15aea47b92fd770bc45cf317aa2b97 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:14 -0700 Subject: xfs: get rid of log item callbacks They are not used anymore, so remove them from the log item and the buffer iodone attachment interfaces. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 721b8420be04..77cc9cbcd311 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2711,7 +2711,8 @@ xfs_ifree_cluster( xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); + list_add_tail(&iip->ili_item.li_bio_list, + &bp->b_li_list); if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -3861,7 +3862,7 @@ flush_out: * the flush lock. */ bp->b_flags |= _XBF_INODES; - xfs_buf_attach_iodone(bp, NULL, &iip->ili_item); + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); -- cgit From 71e3e35646861f2f9b8d36e00720904ed3ca31cb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:18 -0700 Subject: xfs: rework stale inodes in xfs_ifree_cluster Once we have inodes pinning the cluster buffer and attached whenever they are dirty, we no longer have a guarantee that the items are flush locked when we lock the cluster buffer. Hence we cannot just walk the buffer log item list and modify the attached inodes. If the inode is not flush locked, we have to ILOCK it first and then flush lock it to do all the prerequisite checks needed to avoid races with other code. This is already handled by xfs_ifree_get_one_inode(), so rework the inode iteration loop and function to update all inodes in cache whether they are attached to the buffer or not. Note: we also remove the copying of the log item lsn to the ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to trigger aborts and so flush lsn matching is not needed in IO completion for processing freed inodes. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 161 +++++++++++++++++++++-------------------------------- 1 file changed, 62 insertions(+), 99 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 77cc9cbcd311..1c3a8bed4875 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2517,17 +2517,19 @@ out: } /* - * Look up the inode number specified and mark it stale if it is found. If it is - * dirty, return the inode so it can be attached to the cluster buffer so it can - * be processed appropriately when the cluster free transaction completes. + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. */ -static struct xfs_inode * -xfs_ifree_get_one_inode( - struct xfs_perag *pag, +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; struct xfs_inode *ip; retry: @@ -2535,8 +2537,10 @@ retry: ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); /* Inode not in memory, nothing to do */ - if (!ip) - goto out_rcu_unlock; + if (!ip) { + rcu_read_unlock(); + return; + } /* * because this is an RCU protected lookup, we could find a recently @@ -2547,9 +2551,9 @@ retry: spin_lock(&ip->i_flags_lock); if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { spin_unlock(&ip->i_flags_lock); - goto out_rcu_unlock; + rcu_read_unlock(); + return; } - spin_unlock(&ip->i_flags_lock); /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2559,43 +2563,53 @@ retry: */ if (ip != free_ip) { if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); delay(1); goto retry; } - - /* - * Check the inode number again in case we're racing with - * freeing in xfs_reclaim_inode(). See the comments in that - * function for more information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_rcu_unlock; - } } + ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); + /* + * If we can't get the flush lock, the inode is already attached. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. + */ + iip = ip->i_itemp; + if (!xfs_iflock_nowait(ip)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; + } + ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list)); /* - * We don't need to attach clean inodes or those only with unlogged - * changes (which we throw away, anyway). + * Clean inodes can be released immediately. Everything else has to go + * through xfs_iflush_abort() on journal commit as the flock + * synchronises removal of the inode from the cluster buffer against + * inode reclaim. */ - if (!ip->i_itemp || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_no_inode; + goto out_iunlock; } - return ip; -out_rcu_unlock: - rcu_read_unlock(); -out_no_inode: - return NULL; + /* we have a dirty inode in memory that has not yet been flushed. */ + ASSERT(iip->ili_fields); + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + ASSERT(iip->ili_last_fields); + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* @@ -2605,26 +2619,20 @@ out_no_inode: */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - struct xfs_inode_log_item *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_ino_t inum; int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { @@ -2653,10 +2661,8 @@ xfs_ifree_cluster( error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED, &bp); - if (error) { - xfs_perag_put(pag); + if (error) return error; - } /* * This buffer may not have been correctly initialised as we @@ -2670,59 +2676,16 @@ xfs_ifree_cluster( bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (struct xfs_inode_log_item *)lip; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. - */ - for (i = 0; i < igeo->inodes_per_cluster; i++) { - ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); - if (!ip) - continue; - - iip = ip->i_itemp; - spin_lock(&iip->ili_lock); - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - spin_unlock(&iip->ili_lock); - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - list_add_tail(&iip->ili_item.li_bio_list, - &bp->b_li_list); - - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + for (i = 0; i < igeo->inodes_per_cluster; i++) + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - - xfs_perag_put(pag); return 0; } -- cgit From 48d55e2ae3ce837598c073995bbbac5d24a35fe1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:18 -0700 Subject: xfs: attach inodes to the cluster buffer when dirtied Rather than attach inodes to the cluster buffer just when we are doing IO, attach the inodes to the cluster buffer when they are dirtied. The means the buffer always carries a list of dirty inodes that reference it, and we can use that list to make more fundamental changes to inode writeback that aren't otherwise possible. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1c3a8bed4875..c4586ac3656a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2584,27 +2584,24 @@ retry: ASSERT(iip->ili_last_fields); goto out_iunlock; } - ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list)); /* - * Clean inodes can be released immediately. Everything else has to go - * through xfs_iflush_abort() on journal commit as the flock - * synchronises removal of the inode from the cluster buffer against - * inode reclaim. + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. */ - if (xfs_inode_clean(ip)) { + if (!iip || list_empty(&iip->ili_item.li_bio_list)) { xfs_ifunlock(ip); goto out_iunlock; } /* we have a dirty inode in memory that has not yet been flushed. */ - ASSERT(iip->ili_fields); spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); ASSERT(iip->ili_last_fields); out_iunlock: @@ -3818,19 +3815,8 @@ flush_out: xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - /* - * Attach the inode item callback to the buffer whether the flush - * succeeded or not. If not, the caller will shut down and fail I/O - * completion on the buffer to remove the inode from the AIL and release - * the flush lock. - */ - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); - - ASSERT(!list_empty(&bp->b_li_list)); return error; } -- cgit From 90c60e16401248a4900f3f9387f563d0178dcf34 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:19 -0700 Subject: xfs: xfs_iflush() is no longer necessary Now we have a cached buffer on inode log items, we don't need to do buffer lookups when flushing inodes anymore - all we need to do is lock the buffer and we are ready to go. This largely gets rid of the need for xfs_iflush(), which is essentially just a mechanism to look up the buffer and flush the inode to it. Instead, we can just call xfs_iflush_cluster() with a few modifications to ensure it also flushes the inode we already hold locked. This allows the AIL inode item pushing to be almost entirely non-blocking in XFS - we won't block unless memory allocation for the cluster inode lookup blocks or the block device queues are full. Writeback during inode reclaim becomes a little more complex because we now have to lock the buffer ourselves, but otherwise this change is largely a functional no-op that removes a whole lot of code. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 107 +++++++---------------------------------------------- 1 file changed, 14 insertions(+), 93 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c4586ac3656a..4a9539048639 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3450,7 +3450,18 @@ out_release_wip: return error; } -STATIC int +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On success, the caller must write out the buffer returned in *bp and + * release it. On failure, the filesystem will be shut down, the buffer will + * have been unlocked and released, and EFSCORRUPTED will be returned. + */ +int xfs_iflush_cluster( struct xfs_inode *ip, struct xfs_buf *bp) @@ -3485,8 +3496,6 @@ xfs_iflush_cluster( for (i = 0; i < nr_found; i++) { cip = cilist[i]; - if (cip == ip) - continue; /* * because this is an RCU protected lookup, we could find a @@ -3577,99 +3586,11 @@ out_free: kmem_free(cilist); out_put: xfs_perag_put(pag); - return error; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int -xfs_iflush( - struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || - ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, return - * leaving the inode dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode. Abort the flush and shut down. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto abort; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * Flush the provided inode then attempt to gather others from the - * cluster into the write. - * - * Note: Once we attempt to flush an inode, we must run buffer - * completion callbacks on any failure. If this fails, simulate an I/O - * failure on the buffer and shut down. - */ - error = xfs_iflush_int(ip, bp); - if (!error) - error = xfs_iflush_cluster(ip, bp); if (error) { bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - goto shutdown; + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } - - *bpp = bp; - return 0; - -abort: - xfs_iflush_abort(ip); -shutdown: - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } @@ -3687,7 +3608,7 @@ xfs_iflush_int( ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(iip->ili_item.li_buf == bp); dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); -- cgit From e6187b3444e88ed9aa5f3843603e1f024b6d0309 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:19 -0700 Subject: xfs: rename xfs_iflush_int() with xfs_iflush() gone, we can rename xfs_iflush_int() back to xfs_iflush(). Also move it up above xfs_iflush_cluster() so we don't need the forward definition any more. Signed-off-by: Dave Chinner Reviewed-by: Amir Goldstein Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 293 ++++++++++++++++++++++++++--------------------------- 1 file changed, 146 insertions(+), 147 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4a9539048639..31e105f95739 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); @@ -3450,152 +3449,8 @@ out_release_wip: return error; } -/* - * Non-blocking flush of dirty inode metadata into the backing buffer. - * - * The caller must have a reference to the inode and hold the cluster buffer - * locked. The function will walk across all the inodes on the cluster buffer it - * can find and lock without blocking, and flush them to the cluster buffer. - * - * On success, the caller must write out the buffer returned in *bp and - * release it. On failure, the filesystem will be shut down, the buffer will - * have been unlocked and released, and EFSCORRUPTED will be returned. - */ -int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int error = 0; - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. - */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto out_free; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - if (error) { - bp->b_flags |= XBF_ASYNC; - xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } - return error; -} - -STATIC int -xfs_iflush_int( +static int +xfs_iflush( struct xfs_inode *ip, struct xfs_buf *bp) { @@ -3741,6 +3596,150 @@ flush_out: return error; } +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On success, the caller must write out the buffer returned in *bp and + * release it. On failure, the filesystem will be shut down, the buffer will + * have been unlocked and released, and EFSCORRUPTED will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_inode *ip, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + int error = 0; + int nr_found; + int clcount = 0; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) + goto out_put; + + mask = ~(igeo->inodes_per_cluster - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, + first_index, igeo->inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + cip = cilist[i]; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); + continue; + } + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); + break; + } + spin_unlock(&cip->i_flags_lock); + + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); + continue; + } + + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(cip)) { + error = xfs_iflush(cip, bp); + if (error) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); + goto out_free; + } + clcount++; + } else { + xfs_ifunlock(cip); + } + xfs_iunlock(cip, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); + } + +out_free: + rcu_read_unlock(); + kmem_free(cilist); +out_put: + xfs_perag_put(pag); + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return error; +} + /* Release an inode. */ void xfs_irele( -- cgit From 5717ea4d527acbec9300cb083b100dd0003ac777 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:20 -0700 Subject: xfs: rework xfs_iflush_cluster() dirty inode iteration Now that we have all the dirty inodes attached to the cluster buffer, we don't actually have to do radix tree lookups to find them. Sure, the radix tree is efficient, but walking a linked list of just the dirty inodes attached to the buffer is much better. We are also no longer dependent on having a locked inode passed into the function to determine where to start the lookup. This means we can drop it from the function call and treat all inodes the same. We also make xfs_iflush_cluster skip inodes marked with XFS_IRECLAIM. This we avoid races with inodes that reclaim is actively referencing or are being re-initialised by inode lookup. If they are actually dirty, they'll get written by a future cluster flush.... We also add a shutdown check after obtaining the flush lock so that we catch inodes that are dirty in memory and may have inconsistent state due to the shutdown in progress. We abort these inodes directly and so they remove themselves directly from the buffer list and the AIL rather than having to wait for the buffer to be failed and callbacks run to be processed correctly. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 171 +++++++++++++++++++++++------------------------------ 1 file changed, 75 insertions(+), 96 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 31e105f95739..ece3622f6d28 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3603,141 +3603,120 @@ flush_out: * locked. The function will walk across all the inodes on the cluster buffer it * can find and lock without blocking, and flush them to the cluster buffer. * - * On success, the caller must write out the buffer returned in *bp and - * release it. On failure, the filesystem will be shut down, the buffer will - * have been unlocked and released, and EFSCORRUPTED will be returned. + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. */ int xfs_iflush_cluster( - struct xfs_inode *ip, struct xfs_buf *bp) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int error = 0; - int nr_found; + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; + int error = 0; - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; + /* + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. + */ + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. + * Quick and dirty check to avoid locks if possible. */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + continue; + if (xfs_ipincount(ip)) continue; - } /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + spin_unlock(&ip->i_flags_lock); + continue; } - spin_unlock(&cip->i_flags_lock); /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + spin_unlock(&ip->i_flags_lock); continue; + } + spin_unlock(&ip->i_flags_lock); /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. + * Skip inodes that are already flush locked as they have + * already been written to the buffer. */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); + if (!xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. + * Abort flushing this inode if we are shut down because the + * inode may not currently be in the AIL. This can occur when + * log I/O failure unpins the inode without inserting into the + * AIL, leaving a dirty/unpinned inode attached to the buffer + * that otherwise looks like it should be flushed. */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ + xfs_iflush_abort(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = -EIO; continue; } - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - error = xfs_iflush(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto out_free; - } - clcount++; - } else { - xfs_ifunlock(cip); + /* don't block waiting on a log force to unpin dirty inodes */ + if (xfs_ipincount(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, bp); + else + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + break; + clcount++; } -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); if (error) { bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; } - return error; + + if (!clcount) + return -EAGAIN; + + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); + return 0; + } /* Release an inode. */ -- cgit From e2705b0304778916db87831217ec642e34d9d9fa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Jun 2020 14:49:20 -0700 Subject: xfs: remove xfs_inobp_check() This debug code is called on every xfs_iflush() call, which then checks every inode in the buffer for non-zero unlinked list field. Hence it checks every inode in the cluster buffer every time a single inode on that cluster it flushed. This is resulting in: - 38.91% 5.33% [kernel] [k] xfs_iflush - 17.70% xfs_iflush - 9.93% xfs_inobp_check 4.36% xfs_buf_offset 10% of the CPU time spent flushing inodes is repeatedly checking unlinked fields in the buffer. We don't need to do this. The other place we call xfs_inobp_check() is xfs_iunlink_update_dinode(), and this is after we've done this assert for the agino we are about to write into that inode: ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); which means we've already checked that the agino we are about to write is not 0 on debug kernels. The inode buffer verifiers do everything else we need, so let's just remove this debug code. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ece3622f6d28..5c07bf491d9f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2165,7 +2165,6 @@ xfs_iunlink_update_dinode( xfs_dinode_calc_crc(mp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - xfs_inobp_check(mp, ibp); } /* Set an in-core inode's unlinked pointer and return the old value. */ @@ -3558,7 +3557,6 @@ xfs_iflush( xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear -- cgit From 92a005448f6fed70b5e7a9f29a1f930118449f1b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 13 Jul 2020 09:13:00 -0700 Subject: xfs: get rid of unnecessary xfs_perag_{get,put} pairs In the course of some operations, we look up the perag from the mount multiple times to get or change perag information. These are often very short pieces of code, so while the lookup cost is generally low, the cost of the lookup is far higher than the cost of the operation we are doing on the perag. Since we changed buffers to hold references to the perag they are cached in, many modification contexts already hold active references to the perag that are held across these operations. This is especially true for any operation that is serialised by an allocation group header buffer. In these cases, we can just use the buffer's reference to the perag to avoid needing to do lookups to access the perag. This means that many operations don't need to do perag lookups at all to access the perag because they've already looked up objects that own persistent references and hence can use that reference instead. Cc: Dave Chinner Cc: "Darrick J. Wong" Signed-off-by: Gao Xiang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5c07bf491d9f..407d6299606d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2265,7 +2265,6 @@ xfs_iunlink( } if (next_agino != NULLAGINO) { - struct xfs_perag *pag; xfs_agino_t old_agino; /* @@ -2282,9 +2281,7 @@ xfs_iunlink( * agino has been unlinked, add a backref from the next inode * back to agino. */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); if (error) return error; } @@ -2420,7 +2417,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2464,32 +2460,22 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. */ if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); if (error) - goto out; + return error; } - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { + if (head_agino != agino) { struct xfs_imap imap; xfs_agino_t prev_agino; - if (!pag) - pag = xfs_perag_get(mp, agno); - /* We need to search the list for the inode being freed. */ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, - pag); + agibp->b_pag); if (error) - goto out; + return error; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, @@ -2503,15 +2489,13 @@ xfs_iunlink_remove( * change_backref takes care of deleting the backref if * next_agino is NULLAGINO. */ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); } -out: - if (pag) - xfs_perag_put(pag); - return error; + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); } /* -- cgit From b63da6c8dfa9b2ab3554e8c59ef294d1f28bb9bd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 5 Aug 2020 08:49:58 -0700 Subject: xfs: delete duplicated words + other fixes Delete repeated words in fs/xfs/. {we, that, the, a, to, fork} Change "it it" to "it is" in one location. Signed-off-by: Randy Dunlap To: linux-fsdevel@vger.kernel.org Cc: Darrick J. Wong Cc: linux-xfs@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 407d6299606d..c06129cffba9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -451,7 +451,7 @@ xfs_lock_inodes( /* * Currently supports between 2 and 5 inodes with exclusive locking. We * support an arbitrary depth of locking here, but absolute limits on - * inodes depend on the the type of locking and the limits placed by + * inodes depend on the type of locking and the limits placed by * lockdep annotations in xfs_lock_inumorder. These are all checked by * the asserts. */ @@ -3105,7 +3105,7 @@ out_trans_abort: /* * xfs_rename_alloc_whiteout() * - * Return a referenced, unlinked, unlocked inode that that can be used as a + * Return a referenced, unlinked, unlocked inode that can be used as a * whiteout in a rename transaction. We use a tmpfile inode here so that if we * crash between allocating the inode and linking it into the rename transaction * recovery will free the inode and we won't leak it. -- cgit