diff options
Diffstat (limited to 'fs/xfs/scrub/fscounters.c')
| -rw-r--r-- | fs/xfs/scrub/fscounters.c | 410 |
1 files changed, 338 insertions, 72 deletions
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ec2064ed3c30..cebd0d526926 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,21 +1,29 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" -#include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" +#include "xfs_btree.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" +#include "scrub/fscounters.h" /* * FS Summary Counters @@ -67,20 +75,20 @@ xchk_fscount_warmup( struct xfs_buf *agi_bp = NULL; struct xfs_buf *agf_bp = NULL; struct xfs_perag *pag = NULL; - xfs_agnumber_t agno; int error = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); - - if (pag->pagi_init && pag->pagf_init) - goto next_loop_perag; + while ((pag = xfs_perag_next(mp, pag))) { + if (xchk_should_terminate(sc, &error)) + break; + if (xfs_perag_initialised_agi(pag) && + xfs_perag_initialised_agf(pag)) + continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); if (error) break; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); if (error) break; @@ -88,21 +96,16 @@ xchk_fscount_warmup( * These are supposed to be initialized by the header read * function. */ - error = -EFSCORRUPTED; - if (!pag->pagi_init || !pag->pagf_init) + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { + error = -EFSCORRUPTED; break; + } xfs_buf_relse(agf_bp); agf_bp = NULL; xfs_buf_relse(agi_bp); agi_bp = NULL; -next_loop_perag: - xfs_perag_put(pag); - pag = NULL; - error = 0; - - if (xchk_should_terminate(sc, &error)) - break; } if (agf_bp) @@ -110,22 +113,106 @@ next_loop_perag: if (agi_bp) xfs_buf_relse(agi_bp); if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); + return error; +} + +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); + trace_xchk_fsfreeze(sc, error); return error; } +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( - struct xfs_scrub *sc, - struct xfs_inode *ip) + struct xfs_scrub *sc) { struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -135,13 +222,66 @@ xchk_setup_fscounters( return error; /* - * Pause background reclaim while we're scrubbing to reduce the - * likelihood of background perturbations to the counters throwing off - * our calculations. + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + * + * If we're repairing, we need to prevent any other thread from + * changing the global fs summary counters while we're repairing them. + * This requires the fs to be frozen, which will disable background + * reclaim and purge all inactive inodes. */ - xchk_stop_reaping(sc); + if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + xchk_trans_alloc_empty(sc); + return 0; +} - return xchk_trans_alloc(sc, 0); +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + */ + +/* Count free space btree blocks manually for pre-lazysbcount filesystems. */ +static int +xchk_fscount_btreeblks( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc, + xfs_agnumber_t agno) +{ + xfs_filblks_t blocks; + int error; + + error = xchk_ag_init_existing(sc, agno, &sc->sa); + if (error) + goto out_free; + + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + +out_free: + xchk_ag_free(sc, &sc->sa); + return error; } /* @@ -156,9 +296,8 @@ xchk_fscount_aggregate_agcounts( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; uint64_t delayed; - xfs_agnumber_t agno; int tries = 8; int error = 0; @@ -167,13 +306,15 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + while ((pag = xfs_perag_next(mp, pag))) { + if (xchk_should_terminate(sc, &error)) + break; /* This somehow got unset since the warmup? */ - if (!pag->pagi_init || !pag->pagf_init) { - xfs_perag_put(pag); - return -EFSCORRUPTED; + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { + error = -EFSCORRUPTED; + break; } /* Count all the inodes */ @@ -183,7 +324,13 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - fsc->fdblocks += pag->pagf_btreeblks; + if (xfs_has_lazysbcount(sc->mp)) { + fsc->fdblocks += pag->pagf_btreeblks; + } else { + error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag)); + if (error) + break; + } /* * Per-AG reservations are taken out of the incore counters, @@ -192,20 +339,19 @@ retry: fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; - xfs_perag_put(pag); - - if (xchk_should_terminate(sc, &error)) - break; } - - if (error) + if (pag) + xfs_perag_rele(pag); + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. */ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -233,12 +379,81 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); + return -EDEADLOCK; + } + + return 0; +} + +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = NULL; + int error; + + fsc->frextents = 0; + fsc->frextents_delayed = 0; + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, sc->tp, + xchk_fscount_add_frextent, fsc); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xchk_set_incomplete(sc); + xfs_rtgroup_rele(rtg); + return error; + } } + fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); + return 0; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + fsc->frextents_delayed = 0; return 0; } +#endif /* CONFIG_XFS_RT */ + +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. + */ /* * Is the @counter reasonably close to the @expected value? @@ -252,6 +467,8 @@ retry: * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -274,6 +491,10 @@ xchk_fscount_within_range( if (curr_value == expected) return true; + /* We require exact matches when repair is running. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return false; + min_value = min(old_value, curr_value); max_value = max(old_value, curr_value); @@ -281,21 +502,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -306,17 +513,36 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0) + xchk_set_corrupt(sc); + + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_dec_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + xchk_set_corrupt(sc); + return 0; + } /* See if icount is obviously wrong. */ if (icount < fsc->icount_min || icount > fsc->icount_max) @@ -326,6 +552,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -337,19 +567,55 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; + + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, + fsc->frextents - fsc->frextents_delayed)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } |
