From ce85a1e04645b1ed386b074297df27ab5b8801c0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 4 Aug 2023 08:20:57 -0700 Subject: xfs: stabilize fs summary counters for online fsck If the fscounters scrubber notices incorrect summary counters, it's entirely possible that scrub is simply racing with other threads that are updating the incore counters. There isn't a good way to stabilize percpu counters or set ourselves up to observe live updates with hooks like we do for the quotacheck or nlinks scanners, so we instead choose to freeze the filesystem long enough to walk the incore per-AG structures. Past me thought that it was going to be commonplace to have to freeze the filesystem to perform some kind of repair and set up a whole separate infrastructure to freeze the filesystem in such a way that userspace could not unfreeze while we were running. This involved adding a mutex and freeze_super/thaw_super functions and dealing with the fact that the VFS freeze/thaw functions can free the VFS superblock references on return. This was all very overwrought, since fscounters turned out to be the only user of scrub freezes, and it doesn't require the log to quiesce, only the incore superblock counters. We prevent other threads from changing the freeze level by calling freeze_super_excl with a custom freeze cookie to keep everyone else out of the filesystem. The end result is that fscounters should be much more efficient. When we're checking a busy system and we can't stabilize the counters, the custom freeze will do less work, which should result in less downtime. Repair should be similarly speedy, but that's in a later patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 188 +++++++++++++++++++++++++++++++++++++--------- fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/scrub.h | 1 + fs/xfs/scrub/trace.h | 26 +++++++ 4 files changed, 183 insertions(+), 38 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e382a35e98d8..05be757668bb 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019-2023 Oracle. All Rights Reserved. * Author: Darrick J. Wong @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" @@ -16,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -53,6 +56,7 @@ struct xchk_fscounters { uint64_t frextents; unsigned long long icount_min; unsigned long long icount_max; + bool frozen; }; /* @@ -123,6 +127,82 @@ xchk_fscount_warmup( return error; } +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( struct xfs_scrub *sc) @@ -140,6 +220,7 @@ xchk_setup_fscounters( sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; fsc->sc = sc; @@ -150,7 +231,18 @@ xchk_setup_fscounters( if (error) return error; - return xchk_trans_alloc(sc, 0); + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); } /* @@ -290,8 +382,7 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); - return 0; + return -EDEADLOCK; } return 0; @@ -367,6 +458,8 @@ xchk_fscount_count_frextents( * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -396,21 +489,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -422,6 +501,7 @@ xchk_fscounters( struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ @@ -431,9 +511,26 @@ xchk_fscounters( frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) + if (icount < 0 || ifree < 0) xchk_set_corrupt(sc); + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + /* See if icount is obviously wrong. */ if (icount < fsc->icount_min || icount > fsc->icount_max) xchk_set_corrupt(sc); @@ -446,12 +543,6 @@ xchk_fscounters( if (frextents > mp->m_sb.sb_rextents) xchk_set_corrupt(sc); - /* - * XXX: We can't quiesce percpu counter updates, so exit early. - * This can be re-enabled when we gain exclusive freeze functionality. - */ - return 0; - /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -463,8 +554,6 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) - return 0; /* Count the free extents counter for rt volumes. */ error = xchk_fscount_count_frextents(sc, fsc); @@ -473,20 +562,45 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) - xchk_set_corrupt(sc); + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 3d98f604765e..a0fffbcd022b 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -184,8 +184,10 @@ xchk_teardown( xchk_irele(sc, sc->ip); sc->ip = NULL; } - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -505,6 +507,8 @@ retry_op: error = mnt_want_write_file(sc->file); if (error) goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; } /* Set up for the operation. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e113f2f5c254..f8ba00e51ca9 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -106,6 +106,7 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b3894daeb86a..0b54f1a1cf0c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + TRACE_EVENT(xchk_refcount_incorrect, TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, xfs_nlink_t seen), -- cgit From dbbff489064d89391c4f0c7a73e77e61ce29fe96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:33 -0700 Subject: xfs: reformat the xfs_fs_free prototype The xfs_fs_free prototype formatting is a weird mix of the classic XFS style and the Linux style. Fix it up to be consistent. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-2-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..79c1dd9435c2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1934,7 +1934,8 @@ xfs_fs_reconfigure( return 0; } -static void xfs_fs_free( +static void +xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; -- cgit From 1aa2d074d4c777e2150382878d0a5611d829b380 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:34 -0700 Subject: xfs: remove a superfluous s_fs_info NULL check in xfs_fs_put_super ->put_super is only called when sb->s_root is set, and thus when fill_super succeeds. Thus drop the NULL check that can't happen in xfs_fs_put_super. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-3-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 79c1dd9435c2..e621e0da87f3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1133,10 +1133,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; - xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); -- cgit From 2a9311adb87c98599989b80405fe2c60cd4075dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:35 -0700 Subject: xfs: free the xfs_mount in ->kill_sb As a rule of thumb everything allocated to the fs_context and moved into the super_block should be freed by ->kill_sb so that the teardown handling doesn't need to be duplicated between the fill_super error path and put_super. Implement a XFS-specific kill_sb method to do that. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-4-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e621e0da87f3..31260fc09719 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1144,9 +1144,6 @@ xfs_fs_put_super( xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); - - sb->s_fs_info = NULL; - xfs_mount_free(mp); } static long @@ -1488,7 +1485,7 @@ xfs_fs_fill_super( error = xfs_fs_validate_params(mp); if (error) - goto out_free_names; + return error; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1515,7 +1512,7 @@ xfs_fs_fill_super( error = xfs_open_devices(mp); if (error) - goto out_free_names; + return error; error = xfs_init_mount_workqueues(mp); if (error) @@ -1735,9 +1732,6 @@ xfs_fs_fill_super( xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); - out_free_names: - sb->s_fs_info = NULL; - xfs_mount_free(mp); return error; out_unmount: @@ -2000,12 +1994,20 @@ static int xfs_init_fs_context( return 0; } +static void +xfs_kill_sb( + struct super_block *sb) +{ + kill_block_super(sb); + xfs_mount_free(XFS_M(sb)); +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = xfs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); -- cgit From d3ef7e94ee36adc8f0006d253a9ad45793b874cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:36 -0700 Subject: xfs: remove xfs_blkdev_put There isn't much use for this trivial wrapper, especially as the NULL check is only needed in a single call site. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-5-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 31260fc09719..332e32d8ca5a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -406,15 +406,6 @@ xfs_blkdev_get( return error; } -STATIC void -xfs_blkdev_put( - struct xfs_mount *mp, - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, mp); -} - STATIC void xfs_close_devices( struct xfs_mount *mp) @@ -423,13 +414,13 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(mp, logdev); + blkdev_put(logdev, mp); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(mp, rtdev); + blkdev_put(rtdev, mp); } xfs_free_buftarg(mp->m_ddev_targp); } @@ -504,10 +495,11 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(mp, rtdev); + if (rtdev) + blkdev_put(rtdev, mp); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(mp, logdev); + blkdev_put(logdev, mp); return error; } -- cgit From 41233576e9a4515dc9b0bd1cbbb896b520a1f486 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:37 -0700 Subject: xfs: close the RT and log block devices in xfs_free_buftarg Closing the block devices logically belongs into xfs_free_buftarg, So instead of open coding it in the caller move it there and add a check for the s_bdev so that the main device isn't close as that's done by the VFS helper. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-6-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_buf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15d1e5a7c2d3..65110df78e13 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1938,6 +1938,8 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { + struct block_device *bdev = btp->bt_bdev; + unregister_shrinker(&btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); @@ -1946,6 +1948,9 @@ xfs_free_buftarg( blkdev_issue_flush(btp->bt_bdev); invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); + /* the main block device is closed by kill_block_super */ + if (bdev != btp->bt_mount->m_super->s_bdev) + blkdev_put(bdev, btp->bt_mount); kmem_free(btp); } -- cgit From 35a93b148b0363dca23c3db1cc9d48100eb8b276 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:38 -0700 Subject: xfs: close the external block devices in xfs_mount_free blkdev_put must not be called under sb->s_umount to avoid a lock order reversal with disk->open_mutex. Move closing the buftargs into ->kill_sb to archive that. Note that the flushing of the disk caches and block device mapping invalidated needs to stay in ->put_super as the main block device is closed in kill_block_super already. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-7-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_buf.c | 2 -- fs/xfs/xfs_super.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 65110df78e13..f2e5a2c78dc4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,8 +1945,6 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - blkdev_issue_flush(btp->bt_bdev); - invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (bdev != btp->bt_mount->m_super->s_bdev) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 332e32d8ca5a..18d34e802aac 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -407,22 +407,19 @@ xfs_blkdev_get( } STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_logdev_targp); - blkdev_put(logdev, mp); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_rtdev_targp); - blkdev_put(rtdev, mp); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -750,6 +747,17 @@ static void xfs_mount_free( struct xfs_mount *mp) { + /* + * Free the buftargs here because blkdev_put needs to be called outside + * of sb->s_umount, which is held around the call to ->put_super. + */ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -1135,7 +1143,7 @@ xfs_fs_put_super( xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); + xfs_shutdown_devices(mp); } static long @@ -1508,7 +1516,7 @@ xfs_fs_fill_super( error = xfs_init_mount_workqueues(mp); if (error) - goto out_close_devices; + goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) @@ -1722,8 +1730,8 @@ xfs_fs_fill_super( xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: -- cgit From 1a0a5dad67b60250dce151c9533ccbecdfd822d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2023 15:05:39 -0700 Subject: xfs: document the invalidate_bdev call in invalidate_bdev Copy and paste the commit message from Darrick into a comment to explain the seemingly odd invalidate_bdev in xfs_shutdown_devices. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230809220545.1308228-8-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 18d34e802aac..46d0cbe2b5f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -410,6 +410,32 @@ STATIC void xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. + * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. + */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); invalidate_bdev(mp->m_logdev_targp->bt_bdev); -- cgit From 2ea6f68932f73a6a9d82160d3ad0a49a5a6bb183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:25 +0200 Subject: fs: use the super_block as holder when mounting file systems The file system type is not a very useful holder as it doesn't allow us to go back to the actual file system instance. Pass the super_block instead which is useful when passed back to the file system driver. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Message-Id: <20230802154131.2221419-7-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f2e5a2c78dc4..3b903f6bce98 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1948,7 +1948,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (bdev != btp->bt_mount->m_super->s_bdev) - blkdev_put(bdev, btp->bt_mount); + blkdev_put(bdev, btp->bt_mount->m_super); kmem_free(btp); } -- cgit From 8d945b595ed07db13fef1f3311ad456c97941930 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:30 +0200 Subject: xfs: drop s_umount over opening the log and RT devices Just like get_tree_bdev needs to drop s_umount when opening the main device, we need to do the same for the xfs log and RT devices to avoid a potential lock order reversal with s_unmount for the mark_dead path. It might be preferable to just drop s_umount over ->fill_super entirely, but that will require a fairly massive audit first, so we'll do the easy version here first. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230802154131.2221419-12-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 46d0cbe2b5f8..97ef41acb03d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -462,17 +462,24 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; struct block_device *logdev = NULL, *rtdev = NULL; int error; + /* + * blkdev_put() can't be called under s_umount, see the comment + * in get_tree_bdev() for more details + */ + up_write(&sb->s_umount); + /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - return error; + goto out_relock; } if (mp->m_rtname) { @@ -510,7 +517,10 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; } - return 0; + error = 0; +out_relock: + down_write(&sb->s_umount); + return error; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -523,7 +533,7 @@ xfs_open_devices( out_close_logdev: if (logdev && logdev != ddev) blkdev_put(logdev, mp); - return error; + goto out_relock; } /* -- cgit From 8ffa54e3370c5a8b9538dbe4077fc9c4b5a08f45 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:31 +0200 Subject: xfs use fs_holder_ops for the log and RT devices Use the generic fs_holder_ops to shut down the file system when the log or RT device goes away instead of duplicating the logic. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Message-Id: <20230802154131.2221419-13-hch@lst.de> Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 97ef41acb03d..8fee15292499 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,17 +377,6 @@ disable_dax: return 0; } -static void -xfs_bdev_mark_dead( - struct block_device *bdev) -{ - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); -} - -static const struct blk_holder_ops xfs_holder_ops = { - .mark_dead = xfs_bdev_mark_dead, -}; - STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -396,8 +385,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, - &xfs_holder_ops); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -529,10 +518,10 @@ out_relock: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev) - blkdev_put(rtdev, mp); + blkdev_put(rtdev, sb); out_close_logdev: if (logdev && logdev != ddev) - blkdev_put(logdev, mp); + blkdev_put(logdev, sb); goto out_relock; } -- cgit