summaryrefslogtreecommitdiff
path: root/fs/xfs/scrub/fscounters.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/scrub/fscounters.c')
-rw-r--r--fs/xfs/scrub/fscounters.c410
1 files changed, 338 insertions, 72 deletions
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ec2064ed3c30..cebd0d526926 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,21 +1,29 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2019 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
+#include "xfs_btree.h"
+#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/fscounters.h"
/*
* FS Summary Counters
@@ -67,20 +75,20 @@ xchk_fscount_warmup(
struct xfs_buf *agi_bp = NULL;
struct xfs_buf *agf_bp = NULL;
struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno;
int error = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
-
- if (pag->pagi_init && pag->pagf_init)
- goto next_loop_perag;
+ while ((pag = xfs_perag_next(mp, pag))) {
+ if (xchk_should_terminate(sc, &error))
+ break;
+ if (xfs_perag_initialised_agi(pag) &&
+ xfs_perag_initialised_agf(pag))
+ continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
if (error)
break;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
if (error)
break;
@@ -88,21 +96,16 @@ xchk_fscount_warmup(
* These are supposed to be initialized by the header read
* function.
*/
- error = -EFSCORRUPTED;
- if (!pag->pagi_init || !pag->pagf_init)
+ if (!xfs_perag_initialised_agi(pag) ||
+ !xfs_perag_initialised_agf(pag)) {
+ error = -EFSCORRUPTED;
break;
+ }
xfs_buf_relse(agf_bp);
agf_bp = NULL;
xfs_buf_relse(agi_bp);
agi_bp = NULL;
-next_loop_perag:
- xfs_perag_put(pag);
- pag = NULL;
- error = 0;
-
- if (xchk_should_terminate(sc, &error))
- break;
}
if (agf_bp)
@@ -110,22 +113,106 @@ next_loop_perag:
if (agi_bp)
xfs_buf_relse(agi_bp);
if (pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
+ return error;
+}
+
+static inline int
+xchk_fsfreeze(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
+ trace_xchk_fsfreeze(sc, error);
return error;
}
+static inline int
+xchk_fsthaw(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* This should always succeed, we have a kernel freeze */
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
+ trace_xchk_fsthaw(sc, error);
+ return error;
+}
+
+/*
+ * We couldn't stabilize the filesystem long enough to sample all the variables
+ * that comprise the summary counters and compare them to the percpu counters.
+ * We need to disable all writer threads, which means taking the first two
+ * freeze levels to put userspace to sleep, and the third freeze level to
+ * prevent background threads from starting new transactions. Take one level
+ * more to prevent other callers from unfreezing the filesystem while we run.
+ */
+STATIC int
+xchk_fscounters_freeze(
+ struct xfs_scrub *sc)
+{
+ struct xchk_fscounters *fsc = sc->buf;
+ int error = 0;
+
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
+ mnt_drop_write_file(sc->file);
+ }
+
+ /* Try to grab a kernel freeze. */
+ while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(HZ / 10);
+ }
+ if (error)
+ return error;
+
+ fsc->frozen = true;
+ return 0;
+}
+
+/* Thaw the filesystem after checking or repairing fscounters. */
+STATIC void
+xchk_fscounters_cleanup(
+ void *buf)
+{
+ struct xchk_fscounters *fsc = buf;
+ struct xfs_scrub *sc = fsc->sc;
+ int error;
+
+ if (!fsc->frozen)
+ return;
+
+ error = xchk_fsthaw(sc);
+ if (error)
+ xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
+ else
+ fsc->frozen = false;
+}
+
int
xchk_setup_fscounters(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
struct xchk_fscounters *fsc;
int error;
- sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ /*
+ * If the AGF doesn't track btreeblks, we have to lock the AGF to count
+ * btree block usage by walking the actual btrees.
+ */
+ if (!xfs_has_lazysbcount(sc->mp))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
+ sc->buf_cleanup = xchk_fscounters_cleanup;
fsc = sc->buf;
+ fsc->sc = sc;
xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
@@ -135,13 +222,66 @@ xchk_setup_fscounters(
return error;
/*
- * Pause background reclaim while we're scrubbing to reduce the
- * likelihood of background perturbations to the counters throwing off
- * our calculations.
+ * Pause all writer activity in the filesystem while we're scrubbing to
+ * reduce the likelihood of background perturbations to the counters
+ * throwing off our calculations.
+ *
+ * If we're repairing, we need to prevent any other thread from
+ * changing the global fs summary counters while we're repairing them.
+ * This requires the fs to be frozen, which will disable background
+ * reclaim and purge all inactive inodes.
*/
- xchk_stop_reaping(sc);
+ if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
+ error = xchk_fscounters_freeze(sc);
+ if (error)
+ return error;
+ }
+
+ xchk_trans_alloc_empty(sc);
+ return 0;
+}
- return xchk_trans_alloc(sc, 0);
+/*
+ * Part 1: Collecting filesystem summary counts. For each AG, we add its
+ * summary counts (total inodes, free inodes, free data blocks) to an incore
+ * copy of the overall filesystem summary counts.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ */
+
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
+static int
+xchk_fscount_btreeblks(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc,
+ xfs_agnumber_t agno)
+{
+ xfs_filblks_t blocks;
+ int error;
+
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
+ if (error)
+ goto out_free;
+
+ error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+ error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+out_free:
+ xchk_ag_free(sc, &sc->sa);
+ return error;
}
/*
@@ -156,9 +296,8 @@ xchk_fscount_aggregate_agcounts(
struct xchk_fscounters *fsc)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
uint64_t delayed;
- xfs_agnumber_t agno;
int tries = 8;
int error = 0;
@@ -167,13 +306,15 @@ retry:
fsc->ifree = 0;
fsc->fdblocks = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ while ((pag = xfs_perag_next(mp, pag))) {
+ if (xchk_should_terminate(sc, &error))
+ break;
/* This somehow got unset since the warmup? */
- if (!pag->pagi_init || !pag->pagf_init) {
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
+ if (!xfs_perag_initialised_agi(pag) ||
+ !xfs_perag_initialised_agf(pag)) {
+ error = -EFSCORRUPTED;
+ break;
}
/* Count all the inodes */
@@ -183,7 +324,13 @@ retry:
/* Add up the free/freelist/bnobt/cntbt blocks */
fsc->fdblocks += pag->pagf_freeblks;
fsc->fdblocks += pag->pagf_flcount;
- fsc->fdblocks += pag->pagf_btreeblks;
+ if (xfs_has_lazysbcount(sc->mp)) {
+ fsc->fdblocks += pag->pagf_btreeblks;
+ } else {
+ error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag));
+ if (error)
+ break;
+ }
/*
* Per-AG reservations are taken out of the incore counters,
@@ -192,20 +339,19 @@ retry:
fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
- xfs_perag_put(pag);
-
- if (xchk_should_terminate(sc, &error))
- break;
}
-
- if (error)
+ if (pag)
+ xfs_perag_rele(pag);
+ if (error) {
+ xchk_set_incomplete(sc);
return error;
+ }
/*
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -233,12 +379,81 @@ retry:
if (fsc->ifree > fsc->icount) {
if (tries--)
goto retry;
- xchk_set_incomplete(sc);
+ return -EDEADLOCK;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xchk_fscount_add_frextent(
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xchk_fscounters *fsc = priv;
+ int error = 0;
+
+ fsc->frextents += rec->ar_extcount;
+
+ xchk_should_terminate(fsc->sc, &error);
+ return error;
+}
+
+/* Calculate the number of free realtime extents from the realtime bitmap. */
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = NULL;
+ int error;
+
+ fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_all(rtg, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error) {
+ xchk_set_incomplete(sc);
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
}
+ fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
+ return 0;
+}
+#else
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
return 0;
}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Part 2: Comparing filesystem summary counters. All we have to do here is
+ * sum the percpu counters and compare them to what we've observed.
+ */
/*
* Is the @counter reasonably close to the @expected value?
@@ -252,6 +467,8 @@ retry:
* Otherwise, we /might/ have a problem. If the change in the summations is
* more than we want to tolerate, the filesystem is probably busy and we should
* just send back INCOMPLETE and see if userspace will try again.
+ *
+ * If we're repairing then we require an exact match.
*/
static inline bool
xchk_fscount_within_range(
@@ -274,6 +491,10 @@ xchk_fscount_within_range(
if (curr_value == expected)
return true;
+ /* We require exact matches when repair is running. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return false;
+
min_value = min(old_value, curr_value);
max_value = max(old_value, curr_value);
@@ -281,21 +502,7 @@ xchk_fscount_within_range(
if (expected >= min_value && expected <= max_value)
return true;
- /*
- * If the difference between the two summations is too large, the fs
- * might just be busy and so we'll mark the scrub incomplete. Return
- * true here so that we don't mark the counter corrupt.
- *
- * XXX: In the future when userspace can grant scrub permission to
- * quiesce the filesystem to solve the outsized variance problem, this
- * check should be moved up and the return code changed to signal to
- * userspace that we need quiesce permission.
- */
- if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
- xchk_set_incomplete(sc);
- return true;
- }
-
+ /* Everything else is bad. */
return false;
}
@@ -306,17 +513,36 @@ xchk_fscounters(
{
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
- int64_t icount, ifree, fdblocks;
+ int64_t icount, ifree, fdblocks, frextents;
+ bool try_again = false;
int error;
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0)
+ if (icount < 0 || ifree < 0)
+ xchk_set_corrupt(sc);
+
+ /*
+ * If the filesystem is not frozen, the counter summation calls above
+ * can race with xfs_dec_freecounter, which subtracts a requested space
+ * reservation from the counter and undoes the subtraction if that made
+ * the counter go negative. Therefore, it's possible to see negative
+ * values here, and we should only flag that as a corruption if we
+ * froze the fs. This is much more likely to happen with frextents
+ * since there are no reserved pools.
+ */
+ if (fdblocks < 0 || frextents < 0) {
+ if (!fsc->frozen)
+ return -EDEADLOCK;
+
xchk_set_corrupt(sc);
+ return 0;
+ }
/* See if icount is obviously wrong. */
if (icount < fsc->icount_min || icount > fsc->icount_max)
@@ -326,6 +552,10 @@ xchk_fscounters(
if (fdblocks > mp->m_sb.sb_dblocks)
xchk_set_corrupt(sc);
+ /* See if frextents is obviously wrong. */
+ if (frextents > mp->m_sb.sb_rextents)
+ xchk_set_corrupt(sc);
+
/*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
@@ -337,19 +567,55 @@ xchk_fscounters(
error = xchk_fscount_aggregate_agcounts(sc, fsc);
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
return error;
+
+ /* Count the free extents counter for rt volumes. */
+ error = xchk_fscount_count_frextents(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
- /* Compare the in-core counters with whatever we counted. */
- if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
- xchk_set_corrupt(sc);
+ /*
+ * Compare the in-core counters with whatever we counted. If the fs is
+ * frozen, we treat the discrepancy as a corruption because the freeze
+ * should have stabilized the counter values. Otherwise, we need
+ * userspace to call us back having granted us freeze permission.
+ */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
+ fsc->icount)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
- if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
- xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks))
- xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
+
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS].count,
+ fsc->frextents - fsc->frextents_delayed)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
+
+ if (try_again)
+ return -EDEADLOCK;
return 0;
}