Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r--  fs/xfs/scrub/agheader.c    967
-rw-r--r--  fs/xfs/scrub/alloc.c       183
-rw-r--r--  fs/xfs/scrub/attr.c        471
-rw-r--r--  fs/xfs/scrub/bmap.c        734
-rw-r--r--  fs/xfs/scrub/btree.c       676
-rw-r--r--  fs/xfs/scrub/btree.h        66
-rw-r--r--  fs/xfs/scrub/common.c      775
-rw-r--r--  fs/xfs/scrub/common.h      160
-rw-r--r--  fs/xfs/scrub/dabtree.c     613
-rw-r--r--  fs/xfs/scrub/dabtree.h      59
-rw-r--r--  fs/xfs/scrub/dir.c         842
-rw-r--r--  fs/xfs/scrub/ialloc.c      528
-rw-r--r--  fs/xfs/scrub/inode.c       611
-rw-r--r--  fs/xfs/scrub/parent.c      327
-rw-r--r--  fs/xfs/scrub/quota.c       297
-rw-r--r--  fs/xfs/scrub/refcount.c    515
-rw-r--r--  fs/xfs/scrub/rmap.c        261
-rw-r--r--  fs/xfs/scrub/rtbitmap.c    122
-rw-r--r--  fs/xfs/scrub/scrub.c       462
-rw-r--r--  fs/xfs/scrub/scrub.h       152
-rw-r--r--  fs/xfs/scrub/symlink.c      92
-rw-r--r--  fs/xfs/scrub/trace.c        58
-rw-r--r--  fs/xfs/scrub/trace.h       500
-rw-r--r--  fs/xfs/scrub/xfs_scrub.h    29
24 files changed, 9500 insertions, 0 deletions
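
Editor's note: these scrubbers are driven from userspace through the XFS_IOC_SCRUB_METADATA ioctl that scrub.c wires up; the caller names one piece of metadata (a scrub type plus an AG number, or an inode number and generation) and reads the verdict back out of sm_flags. The sketch below is illustrative only and is not part of this patch; it assumes a userspace xfs_fs.h new enough to carry the scrub definitions, and the mount point path is made up.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* struct xfs_scrub_metadata, XFS_IOC_SCRUB_METADATA */

int main(void)
{
	struct xfs_scrub_metadata	sm;
	int				fd;

	fd = open("/mnt", O_RDONLY);	/* any fd on the target filesystem */
	if (fd < 0)
		return 1;

	memset(&sm, 0, sizeof(sm));
	sm.sm_type = XFS_SCRUB_TYPE_AGF;	/* check one AG's AGF header */
	sm.sm_agno = 0;

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0)
		perror("scrub");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		printf("AGF 0: corruption found\n");
	else if (sm.sm_flags & XFS_SCRUB_OFLAG_PREEN)
		printf("AGF 0: ok, but could be optimized\n");
	else
		printf("AGF 0: clean\n");
	return 0;
}
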
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
new file mode 100644
index 000000000000..018aabbd9394
--- /dev/null
+++ b/fs/xfs/scrub/agheader.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Walk all the blocks in the AGFL. The fn function can return any negative
+ * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ */
+int
+xfs_scrub_walk_agfl(
+ struct xfs_scrub_context *sc,
+ int (*fn)(struct xfs_scrub_context *,
+ xfs_agblock_t bno, void *),
+ void *priv)
+{
+ struct xfs_agf *agf;
+ __be32 *agfl_bno;
+ struct xfs_mount *mp = sc->mp;
+ unsigned int flfirst;
+ unsigned int fllast;
+ int i;
+ int error;
+
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
+ flfirst = be32_to_cpu(agf->agf_flfirst);
+ fllast = be32_to_cpu(agf->agf_fllast);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* first to last is a consecutive list. */
+ if (fllast >= flfirst) {
+ for (i = flfirst; i <= fllast; i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ return 0;
+ }
+
+ /* first to the end */
+ for (i = flfirst; i < xfs_agfl_size(mp); i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ /* the start of the array to the last entry. */
+ for (i = 0; i <= fllast; i++) {
+ error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Superblock */
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_superblock_xref(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_SB_BLOCK(mp);
+
+ error = xfs_scrub_ag_init(sc, agno, &sc->sa);
+ if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error))
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/*
+ * Scrub the filesystem superblock.
+ *
+ * Note: We do /not/ attempt to check AG 0's superblock. Mount is
+ * responsible for validating all the geometry information in sb 0, so
+ * if the filesystem is capable of initiating online scrub, then clearly
+ * sb 0 is ok and we can use its information to check everything else.
+ */
+int
+xfs_scrub_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_dsb *sb;
+ xfs_agnumber_t agno;
+ uint32_t v2_ok;
+ __be32 features_mask;
+ int error;
+ __be16 vernum_mask;
+
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return 0;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ /*
+ * The superblock verifier can return several different error codes
+ * if it thinks the superblock doesn't look right. For a mount these
+ * would all get bounced back to userspace, but if we're here then the
+ * fs mounted successfully, which means that this secondary superblock
+ * is simply incorrect. Treat all these codes the same way we treat
+ * any corruption.
+ */
+ switch (error) {
+ case -EINVAL: /* also -EWRONGFS */
+ case -ENOSYS:
+ case -EFBIG:
+ error = -EFSCORRUPTED;
+ default:
+ break;
+ }
+ if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
+ return error;
+
+ sb = XFS_BUF_TO_SBP(bp);
+
+ /*
+ * Verify the geometries match. Fields that are permanently
+ * set by mkfs are checked; fields that can be updated later
+ * (and are not propagated to backup superblocks) are preen
+ * checked.
+ */
+ if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that are set at mkfs time. */
+ vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
+ XFS_SB_VERSION_NUMBITS |
+ XFS_SB_VERSION_ALIGNBIT |
+ XFS_SB_VERSION_DALIGNBIT |
+ XFS_SB_VERSION_SHAREDBIT |
+ XFS_SB_VERSION_LOGV2BIT |
+ XFS_SB_VERSION_SECTORBIT |
+ XFS_SB_VERSION_EXTFLGBIT |
+ XFS_SB_VERSION_DIRV2BIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that can be set after mkfs time. */
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT |
+ XFS_SB_VERSION_QUOTABIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname)))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_blocklog != mp->m_sb.sb_blocklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_sectlog != mp->m_sb.sb_sectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodelog != mp->m_sb.sb_inodelog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblog != mp->m_sb.sb_inopblog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblklog != mp->m_sb.sb_agblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct)
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the summary counters since we track them in memory anyway.
+ * sb_icount, sb_ifree, sb_fdblocks, sb_frexents
+ */
+
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the quota flags since repair will force quotacheck.
+ * sb_qflags
+ */
+
+ if (sb->sb_flags != mp->m_sb.sb_flags)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Do we see any invalid bits in sb_features2? */
+ if (!xfs_sb_version_hasmorebits(&mp->m_sb)) {
+ if (sb->sb_features2 != 0)
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ v2_ok = XFS_SB_VERSION2_OKBITS;
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5)
+ v2_ok |= XFS_SB_VERSION2_CRCBIT;
+
+ if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_features2 != sb->sb_bad_features2)
+ xfs_scrub_block_set_preen(sc, bp);
+ }
+
+ /* Check sb_features2 flags that are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT |
+ XFS_SB_VERSION2_PROJID32BIT |
+ XFS_SB_VERSION2_CRCBIT |
+ XFS_SB_VERSION2_FTYPE);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_features2 flags that can be set after mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ /* all v5 fields must be zero */
+ if (memchr_inv(&sb->sb_features_compat, 0,
+ sizeof(struct xfs_dsb) -
+ offsetof(struct xfs_dsb, sb_features_compat)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ /* Check compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
+ if ((sb->sb_features_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check ro compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
+ XFS_SB_FEAT_RO_COMPAT_FINOBT |
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT |
+ XFS_SB_FEAT_RO_COMPAT_REFLINK);
+ if ((sb->sb_features_ro_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
+ XFS_SB_FEAT_INCOMPAT_FTYPE |
+ XFS_SB_FEAT_INCOMPAT_SPINODES |
+ XFS_SB_FEAT_INCOMPAT_META_UUID);
+ if ((sb->sb_features_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check log incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
+ if ((sb->sb_features_log_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Don't care about sb_crc */
+
+ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /* Don't care about sb_lsn */
+ }
+
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb)) {
+ /* The metadata UUID must be the same for all supers */
+ if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ }
+
+ /* Everything else must be zero. */
+ if (memchr_inv(sb + 1, 0,
+ BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ xfs_scrub_superblock_xref(sc, bp);
+
+ return error;
+}
+
+/* AGF */
+
+/* Tally freespace record lengths. */
+STATIC int
+xfs_scrub_agf_record_bno_lengths(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ xfs_extlen_t *blocks = priv;
+
+ (*blocks) += rec->ar_blockcount;
+ return 0;
+}
+
+/* Check agf_freeblks */
+static inline void
+xfs_scrub_agf_xref_freeblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ xfs_extlen_t blocks = 0;
+ int error;
+
+ if (!sc->sa.bno_cur)
+ return;
+
+ error = xfs_alloc_query_all(sc->sa.bno_cur,
+ xfs_scrub_agf_record_bno_lengths, &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ if (blocks != be32_to_cpu(agf->agf_freeblks))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Cross reference the AGF with the cntbt (freespace by length btree) */
+static inline void
+xfs_scrub_agf_xref_cntbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ xfs_agblock_t agbno;
+ xfs_extlen_t blocks;
+ int have;
+ int error;
+
+ if (!sc->sa.cnt_cur)
+ return;
+
+ /* Any freespace at all? */
+ error = xfs_alloc_lookup_le(sc->sa.cnt_cur, 0, -1U, &have);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ if (!have) {
+ if (agf->agf_freeblks != be32_to_cpu(0))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+ return;
+ }
+
+ /* Check agf_longest */
+ error = xfs_alloc_get_rec(sc->sa.cnt_cur, &agbno, &blocks, &have);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ if (!have || blocks != be32_to_cpu(agf->agf_longest))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Check the btree block counts in the AGF against the btrees. */
+STATIC void
+xfs_scrub_agf_xref_btreeblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t blocks;
+ xfs_agblock_t btreeblks;
+ int error;
+
+ /* Check agf_rmap_blocks; set up for agf_btreeblks check */
+ if (sc->sa.rmap_cur) {
+ error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ btreeblks = blocks - 1;
+ if (blocks != be32_to_cpu(agf->agf_rmap_blocks))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+ } else {
+ btreeblks = 0;
+ }
+
+ /*
+ * No rmap cursor; we can't xref if we have the rmapbt feature.
+ * We also can't do it if we're missing the free space btree cursors.
+ */
+ if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) ||
+ !sc->sa.bno_cur || !sc->sa.cnt_cur)
+ return;
+
+ /* Check agf_btreeblks */
+ error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ btreeblks += blocks - 1;
+
+ error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.cnt_cur))
+ return;
+ btreeblks += blocks - 1;
+
+ if (btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Check agf_refcount_blocks against tree size */
+static inline void
+xfs_scrub_agf_xref_refcblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ xfs_agblock_t blocks;
+ int error;
+
+ if (!sc->sa.refc_cur)
+ return;
+
+ error = xfs_btree_count_blocks(sc->sa.refc_cur, &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (blocks != be32_to_cpu(agf->agf_refcount_blocks))
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agf_bp);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_agf_xref(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGF_BLOCK(mp);
+
+ error = xfs_scrub_ag_btcur_init(sc, &sc->sa);
+ if (error)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_agf_xref_freeblks(sc);
+ xfs_scrub_agf_xref_cntbt(sc);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xfs_scrub_agf_xref_btreeblks(sc);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+ xfs_scrub_agf_xref_refcblks(sc);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/* Scrub the AGF. */
+int
+xfs_scrub_agf(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agblock_t agfl_first;
+ xfs_agblock_t agfl_last;
+ xfs_agblock_t agfl_count;
+ xfs_agblock_t fl_count;
+ int level;
+ int error = 0;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error))
+ goto out;
+ xfs_scrub_buffer_recheck(sc, sc->sa.agf_bp);
+
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agf->agf_length);
+ if (eoag != xfs_ag_block_count(mp, agno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ /* Check the AGF btree roots and levels */
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agbno = be32_to_cpu(agf->agf_refcount_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ level = be32_to_cpu(agf->agf_refcount_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ }
+
+ /* Check the AGFL counters */
+ agfl_first = be32_to_cpu(agf->agf_flfirst);
+ agfl_last = be32_to_cpu(agf->agf_fllast);
+ agfl_count = be32_to_cpu(agf->agf_flcount);
+ if (agfl_last > agfl_first)
+ fl_count = agfl_last - agfl_first + 1;
+ else
+ fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1;
+ if (agfl_count != 0 && fl_count != agfl_count)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+ xfs_scrub_agf_xref(sc);
+out:
+ return error;
+}
+
+/* AGFL */
+
+struct xfs_scrub_agfl_info {
+ struct xfs_owner_info oinfo;
+ unsigned int sz_entries;
+ unsigned int nr_entries;
+ xfs_agblock_t *entries;
+};
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_agfl_block_xref(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ struct xfs_owner_info *oinfo)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+}
+
+/* Scrub an AGFL block. */
+STATIC int
+xfs_scrub_agfl_block(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_agfl_info *sai = priv;
+ xfs_agnumber_t agno = sc->sa.agno;
+
+ if (xfs_verify_agbno(mp, agno, agbno) &&
+ sai->nr_entries < sai->sz_entries)
+ sai->entries[sai->nr_entries++] = agbno;
+ else
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp);
+
+ xfs_scrub_agfl_block_xref(sc, agbno, priv);
+
+ return 0;
+}
+
+static int
+xfs_scrub_agblock_cmp(
+ const void *pa,
+ const void *pb)
+{
+ const xfs_agblock_t *a = pa;
+ const xfs_agblock_t *b = pb;
+
+ return (int)*a - (int)*b;
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_agfl_xref(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGFL_BLOCK(mp);
+
+ error = xfs_scrub_ag_btcur_init(sc, &sc->sa);
+ if (error)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+
+ /*
+ * Scrub teardown will take care of sc->sa for us. Leave sc->sa
+ * active so that the agfl block xref can use it too.
+ */
+}
+
+/* Scrub the AGFL. */
+int
+xfs_scrub_agfl(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_agfl_info sai;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ unsigned int agflcount;
+ unsigned int i;
+ int error;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ goto out;
+ if (!sc->sa.agf_bp)
+ return -EFSCORRUPTED;
+ xfs_scrub_buffer_recheck(sc, sc->sa.agfl_bp);
+
+ xfs_scrub_agfl_xref(sc);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Allocate buffer to ensure uniqueness of AGFL entries. */
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agflcount = be32_to_cpu(agf->agf_flcount);
+ if (agflcount > xfs_agfl_size(sc->mp)) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out;
+ }
+ memset(&sai, 0, sizeof(sai));
+ sai.sz_entries = agflcount;
+ sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
+ if (!sai.entries) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Check the blocks in the AGFL. */
+ xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+ if (error)
+ goto out_free;
+
+ if (agflcount != sai.nr_entries) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ goto out_free;
+ }
+
+ /* Sort entries, check for duplicates. */
+ sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]),
+ xfs_scrub_agblock_cmp, NULL);
+ for (i = 1; i < sai.nr_entries; i++) {
+ if (sai.entries[i] == sai.entries[i - 1]) {
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+ break;
+ }
+ }
+
+out_free:
+ kmem_free(sai.entries);
+out:
+ return error;
+}
+
+/* AGI */
+
+/* Check agi_count/agi_freecount */
+static inline void
+xfs_scrub_agi_xref_icounts(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ xfs_agino_t icount;
+ xfs_agino_t freecount;
+ int error;
+
+ if (!sc->sa.ino_cur)
+ return;
+
+ error = xfs_ialloc_count_inodes(sc->sa.ino_cur, &icount, &freecount);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (be32_to_cpu(agi->agi_count) != icount ||
+ be32_to_cpu(agi->agi_freecount) != freecount)
+ xfs_scrub_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_agi_xref(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agbno = XFS_AGI_BLOCK(mp);
+
+ error = xfs_scrub_ag_btcur_init(sc, &sc->sa);
+ if (error)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, 1);
+ xfs_scrub_agi_xref_icounts(sc);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+
+ /* scrub teardown will take care of sc->sa for us */
+}
+
+/* Scrub the AGI. */
+int
+xfs_scrub_agi(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agi *agi;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t eoag;
+ xfs_agino_t agino;
+ xfs_agino_t first_agino;
+ xfs_agino_t last_agino;
+ xfs_agino_t icount;
+ int i;
+ int level;
+ int error = 0;
+
+ agno = sc->sa.agno = sc->sm->sm_agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+ &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error))
+ goto out;
+ xfs_scrub_buffer_recheck(sc, sc->sa.agi_bp);
+
+ agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+
+ /* Check the AG length */
+ eoag = be32_to_cpu(agi->agi_length);
+ if (eoag != xfs_ag_block_count(mp, agno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check btree roots and levels */
+ agbno = be32_to_cpu(agi->agi_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agbno = be32_to_cpu(agi->agi_free_root);
+ if (!xfs_verify_agbno(mp, agno, agbno))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ level = be32_to_cpu(agi->agi_free_level);
+ if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ /* Check inode counters */
+ xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino);
+ icount = be32_to_cpu(agi->agi_count);
+ if (icount > last_agino - first_agino + 1 ||
+ icount < be32_to_cpu(agi->agi_freecount))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check inode pointers */
+ agino = be32_to_cpu(agi->agi_newino);
+ if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ agino = be32_to_cpu(agi->agi_dirino);
+ if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ /* Check unlinked inode buckets */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ agino = be32_to_cpu(agi->agi_unlinked[i]);
+ if (agino == NULLAGINO)
+ continue;
+ if (!xfs_verify_agino(mp, agno, agino))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (agi->agi_pad32 != cpu_to_be32(0))
+ xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+ xfs_scrub_agi_xref(sc);
+out:
+ return error;
+}
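
Editor's note on the AGFL logic above: the free list is a fixed-size circular array of block numbers, with agf_flfirst and agf_fllast as slot indices, so when the last index is below the first the live region wraps past the end of the array. The standalone sketch below is not part of the patch; walk_circular() and the sample geometry are invented for illustration and mirror the index arithmetic used by xfs_scrub_walk_agfl() and the fl_count check in xfs_scrub_agf().

#include <stdio.h>

/* Visit every live slot of a circular list running first..last, wrapping at size. */
static void walk_circular(unsigned int first, unsigned int last,
			  unsigned int size, unsigned int count)
{
	unsigned int	visited = 0;
	unsigned int	i;

	if (count == 0)
		return;				/* empty list, nothing to walk */

	if (last >= first) {
		for (i = first; i <= last; i++)
			visited++;		/* fn(agfl_bno[i]) would run here */
	} else {
		for (i = first; i < size; i++)
			visited++;		/* first to the end of the array */
		for (i = 0; i <= last; i++)
			visited++;		/* wrap: start of the array to last */
	}

	printf("visited %u slots, header claims %u\n", visited, count);
}

int main(void)
{
	walk_circular(3, 1, 8, 7);	/* wrapped case: slots 3..7, then 0..1 */
	walk_circular(2, 5, 8, 4);	/* contiguous case: slots 2..5 */
	return 0;
}

The same wrap-around reasoning gives the fl_count formula in xfs_scrub_agf(): size - first + last + 1 entries when the list wraps, and last - first + 1 otherwise.
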
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
new file mode 100644
index 000000000000..517c079d3f68
--- /dev/null
+++ b/fs/xfs/scrub/alloc.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub free space btrees.
+ */
+int
+xfs_scrub_setup_ag_allocbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Free space btree scrubber. */
+
+/*
+ * Ensure there's a corresponding cntbt/bnobt record matching this
+ * bnobt/cntbt record, respectively.
+ */
+STATIC void
+xfs_scrub_allocbt_xref_other(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_btree_cur **pcur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int has_otherrec;
+ int error;
+
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
+ pcur = &sc->sa.cnt_cur;
+ else
+ pcur = &sc->sa.bno_cur;
+ if (!*pcur)
+ return;
+
+ error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec);
+ if (!xfs_scrub_should_check_xref(sc, &error, pcur))
+ return;
+ if (!has_otherrec) {
+ xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0);
+ return;
+ }
+
+ error = xfs_alloc_get_rec(*pcur, &fbno, &flen, &has_otherrec);
+ if (!xfs_scrub_should_check_xref(sc, &error, pcur))
+ return;
+ if (!has_otherrec) {
+ xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0);
+ return;
+ }
+
+ if (fbno != agbno || flen != len)
+ xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_allocbt_xref(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_allocbt_xref_other(sc, agbno, len);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len);
+ xfs_scrub_xref_has_no_owner(sc, agbno, len);
+ xfs_scrub_xref_is_not_shared(sc, agbno, len);
+}
+
+/* Scrub a bnobt/cntbt record. */
+STATIC int
+xfs_scrub_allocbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ int error = 0;
+
+ bno = be32_to_cpu(rec->alloc.ar_startblock);
+ len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xfs_scrub_allocbt_xref(bs->sc, bno, len);
+
+ return error;
+}
+
+/* Scrub the freespace btrees for some AG. */
+STATIC int
+xfs_scrub_allocbt(
+ struct xfs_scrub_context *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_btree_cur *cur;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
+ return xfs_scrub_btree(sc, cur, xfs_scrub_allocbt_rec, &oinfo, NULL);
+}
+
+int
+xfs_scrub_bnobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_BNO);
+}
+
+int
+xfs_scrub_cntbt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT);
+}
+
+/* xref check that the extent is not free */
+void
+xfs_scrub_xref_is_used_space(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ bool is_freesp;
+ int error;
+
+ if (!sc->sa.bno_cur)
+ return;
+
+ error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.bno_cur))
+ return;
+ if (is_freesp)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0);
+}
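
Editor's note on the record check above: before any cross-referencing, each bnobt/cntbt record must be internally plausible — the length must be nonzero, start + length must not wrap the 32-bit AG block space, and both the first and last blocks must fall inside the AG. The standalone sketch below is not part of the patch; EXAMPLE_AG_BLOCKS, agbno_is_valid() and extent_is_plausible() are invented stand-ins for the AG geometry and the xfs_verify_agbno() calls in xfs_scrub_allocbt_rec().

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_AG_BLOCKS	65536u		/* assumed AG size in blocks */

static bool agbno_is_valid(uint32_t agbno)
{
	return agbno < EXAMPLE_AG_BLOCKS;
}

static bool extent_is_plausible(uint32_t bno, uint32_t len)
{
	/* bno + len <= bno catches both len == 0 and 32-bit wraparound. */
	if ((uint32_t)(bno + len) <= bno)
		return false;

	/* The first and the last block of the extent must lie inside the AG. */
	return agbno_is_valid(bno) && agbno_is_valid(bno + len - 1);
}

When this test fails, xfs_scrub_allocbt_rec() marks the btree corrupt and still calls the cross-reference helpers, which bail out on their own once the corrupt flag is set.
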
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
new file mode 100644
index 000000000000..127575f0abfb
--- /dev/null
+++ b/fs/xfs/scrub/attr.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/dabtree.h"
+#include "scrub/trace.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Set us up to scrub an inode's extended attributes. */
+int
+xfs_scrub_setup_xattr(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ size_t sz;
+
+ /*
+ * Allocate the buffer without the inode lock held. We need enough
+ * space to read every xattr value in the file or enough space to
+ * hold three copies of the xattr free space bitmap. (Not both at
+ * the same time.)
+ */
+ sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) *
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize));
+ sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Extended Attributes */
+
+struct xfs_scrub_xattr {
+ struct xfs_attr_list_context context;
+ struct xfs_scrub_context *sc;
+};
+
+/*
+ * Check that an extended attribute key can be looked up by hash.
+ *
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked)
+ * to call this function for every attribute key in an inode. Once
+ * we're here, we load the attribute value to see if any errors happen,
+ * or if we get more or less data than we expected.
+ */
+static void
+xfs_scrub_xattr_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
+{
+ struct xfs_scrub_xattr *sx;
+ struct xfs_da_args args = { NULL };
+ int error = 0;
+
+ sx = container_of(context, struct xfs_scrub_xattr, context);
+
+ if (flags & XFS_ATTR_INCOMPLETE) {
+ /* Incomplete attr key, just mark the inode for preening. */
+ xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino);
+ return;
+ }
+
+ args.flags = ATTR_KERNOTIME;
+ if (flags & XFS_ATTR_ROOT)
+ args.flags |= ATTR_ROOT;
+ else if (flags & XFS_ATTR_SECURE)
+ args.flags |= ATTR_SECURE;
+ args.geo = context->dp->i_mount->m_attr_geo;
+ args.whichfork = XFS_ATTR_FORK;
+ args.dp = context->dp;
+ args.name = name;
+ args.namelen = namelen;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.trans = context->tp;
+ args.value = sx->sc->buf;
+ args.valuelen = XATTR_SIZE_MAX;
+
+ error = xfs_attr_get_ilocked(context->dp, &args);
+ if (error == -EEXIST)
+ error = 0;
+ if (!xfs_scrub_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ &error))
+ goto fail_xref;
+ if (args.valuelen != valuelen)
+ xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
+ args.blkno);
+
+fail_xref:
+ return;
+}
+
+/*
+ * Mark a range [start, start+len) in this map. Returns true if the
+ * region was free, and false if there's a conflict or a problem.
+ *
+ * Within a char, the lowest bit of the char represents the byte with
+ * the smallest address
+ */
+STATIC bool
+xfs_scrub_xattr_set_map(
+ struct xfs_scrub_context *sc,
+ unsigned long *map,
+ unsigned int start,
+ unsigned int len)
+{
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ bool ret = true;
+
+ if (start >= mapsize)
+ return false;
+ if (start + len > mapsize) {
+ len = mapsize - start;
+ ret = false;
+ }
+
+ if (find_next_bit(map, mapsize, start) < start + len)
+ ret = false;
+ bitmap_set(map, start, len);
+
+ return ret;
+}
+
+/*
+ * Check the leaf freemap against the usage bitmap. Returns false if the
+ * attr freemap has problems or points to used space.
+ */
+STATIC bool
+xfs_scrub_xattr_check_freemap(
+ struct xfs_scrub_context *sc,
+ unsigned long *map,
+ struct xfs_attr3_icleaf_hdr *leafhdr)
+{
+ unsigned long *freemap;
+ unsigned long *dstmap;
+ unsigned int mapsize = sc->mp->m_attr_geo->blksize;
+ int i;
+
+ /* Construct bitmap of freemap contents. */
+ freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
+ bitmap_zero(freemap, mapsize);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (!xfs_scrub_xattr_set_map(sc, freemap,
+ leafhdr->freemap[i].base,
+ leafhdr->freemap[i].size))
+ return false;
+ }
+
+ /* Look for bits that are set in freemap and are marked in use. */
+ dstmap = freemap + BITS_TO_LONGS(mapsize);
+ return bitmap_and(dstmap, freemap, map, mapsize) == 0;
+}
+
+/*
+ * Check this leaf entry's relations to everything else.
+ * Returns the number of bytes used for the name/value data.
+ */
+STATIC void
+xfs_scrub_xattr_entry(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ unsigned long *usedmap,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ unsigned int *usedbytes,
+ __u32 *last_hashval)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ char *name_end;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ unsigned int nameidx;
+ unsigned int namesize;
+
+ if (ent->pad2 != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ /* Hash values in order? */
+ if (be32_to_cpu(ent->hashval) < *last_hashval)
+ xfs_scrub_da_set_corrupt(ds, level);
+ *last_hashval = be32_to_cpu(ent->hashval);
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused ||
+ nameidx >= mp->m_attr_geo->blksize) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return;
+ }
+
+ /* Check the name information. */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0 || rentry->valueblk == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+ if (name_end > buf_end)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ *usedbytes += namesize;
+}
+
+/* Scrub an attribute leaf. */
+STATIC int
+xfs_scrub_xattr_block(
+ struct xfs_scrub_da_btree *ds,
+ int level)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_buf *bp = blk->bp;
+ xfs_dablk_t *last_checked = ds->private;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ unsigned long *usedmap = ds->sc->buf;
+ char *buf_end;
+ size_t off;
+ __u32 last_hashval = 0;
+ unsigned int usedbytes = 0;
+ unsigned int hdrsize;
+ int i;
+
+ if (*last_checked == blk->blkno)
+ return 0;
+ *last_checked = blk->blkno;
+ bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+
+ /* Check all the padding. */
+ if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) {
+ struct xfs_attr3_leafblock *leaf = bp->b_addr;
+
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
+ leaf->hdr.info.hdr.pad != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ } else {
+ if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+
+ /* Check the leaf header */
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+
+ if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (leafhdr.firstused > mp->m_attr_geo->blksize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (leafhdr.firstused < hdrsize)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ /* Mark the leaf entry itself. */
+ off = (char *)ent - (char *)leaf;
+ if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t))) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Check the entry and nameval. */
+ xfs_scrub_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
+ usedmap, ent, i, &usedbytes, &last_hashval);
+
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+ }
+
+ if (!xfs_scrub_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ if (leafhdr.usedbytes != usedbytes)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+out:
+ return 0;
+}
+
+/* Scrub an attribute btree record. */
+STATIC int
+xfs_scrub_xattr_rec(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ void *rec)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_attr_leaf_entry *ent = rec;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_buf *bp;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ int nameidx;
+ int hdrsize;
+ unsigned int badflags;
+ int error;
+
+ blk = &ds->state->path.blk[level];
+
+ /* Check the whole block, if necessary. */
+ error = xfs_scrub_xattr_block(ds, level);
+ if (error)
+ goto out;
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check the hash of the entry. */
+ error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Find the attr entry's location. */
+ bp = blk->bp;
+ hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr);
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+
+ /* Retrieve the entry and check it. */
+ hash = be32_to_cpu(ent->hashval);
+ badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
+ XFS_ATTR_INCOMPLETE);
+ if ((ent->flags & badflags) != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = (struct xfs_attr_leaf_name_local *)
+ (((char *)bp->b_addr) + nameidx);
+ if (lentry->namelen <= 0) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ } else {
+ rentry = (struct xfs_attr_leaf_name_remote *)
+ (((char *)bp->b_addr) + nameidx);
+ if (rentry->namelen <= 0) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ }
+ if (calc_hash != hash)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+out:
+ return error;
+}
+
+/* Scrub the extended attribute metadata. */
+int
+xfs_scrub_xattr(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_xattr sx;
+ struct attrlist_cursor_kern cursor = { 0 };
+ xfs_dablk_t last_checked = -1U;
+ int error = 0;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ memset(&sx, 0, sizeof(sx));
+ /* Check attribute tree structure */
+ error = xfs_scrub_da_btree(sc, XFS_ATTR_FORK, xfs_scrub_xattr_rec,
+ &last_checked);
+ if (error)
+ goto out;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check that every attr key can also be looked up by hash. */
+ sx.context.dp = sc->ip;
+ sx.context.cursor = &cursor;
+ sx.context.resynch = 1;
+ sx.context.put_listent = xfs_scrub_xattr_listent;
+ sx.context.tp = sc->tp;
+ sx.context.flags = ATTR_INCOMPLETE;
+ sx.sc = sc;
+
+ /*
+ * Look up every xattr in this file by name.
+ *
+ * Use the backend implementation of xfs_attr_list to call
+ * xfs_scrub_xattr_listent on every attribute key in this inode.
+ * In other words, we use the same iterator/callback mechanism
+ * that listattr uses to scrub extended attributes, though in our
+ * _listent function, we check the value of the attribute.
+ *
+ * The VFS only locks i_rwsem when modifying attrs, so keep all
+ * three locks held because that's the only way to ensure we're
+ * the only thread poking into the da btree. We traverse the da
+ * btree while holding a leaf buffer locked for the xattr name
+ * iteration, which doesn't really follow the usual buffer
+ * locking order.
+ */
+ error = xfs_attr_list_int_ilocked(&sx.context);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out;
+out:
+ return error;
+}
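
Editor's note on the attr leaf checks above: they come down to claiming byte ranges of the block in a bitmap — the header, every entry, and every name/value region must be claimed exactly once, and the free regions advertised by the leaf header must not intersect anything already claimed. The standalone sketch below is not part of the patch and uses a plain byte-per-flag array; BLKSZ, claim_region() and freemap_is_consistent() are illustrative stand-ins for the kernel bitmap helpers behind xfs_scrub_xattr_set_map() and xfs_scrub_xattr_check_freemap().

#include <stdbool.h>

#define BLKSZ	4096u			/* assumed attr leaf block size */

/* Claim [start, start+len); return false on overlap or out-of-range. */
static bool claim_region(unsigned char map[BLKSZ], unsigned int start,
			 unsigned int len)
{
	unsigned int	i;
	bool		ok = true;

	if (start >= BLKSZ)
		return false;
	if (len > BLKSZ - start) {
		len = BLKSZ - start;	/* clamp, but report the overrun */
		ok = false;
	}

	for (i = start; i < start + len; i++) {
		if (map[i])
			ok = false;	/* this byte was claimed twice */
		map[i] = 1;
	}
	return ok;
}

/* The header's free regions must not intersect anything in use. */
static bool freemap_is_consistent(const unsigned char used[BLKSZ],
				  const unsigned char freemap[BLKSZ])
{
	unsigned int	i;

	for (i = 0; i < BLKSZ; i++)
		if (used[i] && freemap[i])
			return false;
	return true;
}
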
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
new file mode 100644
index 000000000000..639d14b51e90
--- /dev/null
+++ b/fs/xfs/scrub/bmap.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Set us up with an inode's bmap. */
+int
+xfs_scrub_setup_inode_bmap(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_scrub_get_inode(sc, ip);
+ if (error)
+ goto out;
+
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /*
+ * We don't want any ephemeral data fork updates sitting around
+ * while we inspect block mappings, so wait for directio to finish
+ * and flush dirty data if we have delalloc reservations.
+ */
+ if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
+ sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+ inode_dio_wait(VFS_I(sc->ip));
+ error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode */
+ return error;
+}
+
+/*
+ * Inode fork block mapping (BMBT) scrubber.
+ * More complex than the others because we have to scrub
+ * all the extents regardless of whether or not the fork
+ * is in btree format.
+ */
+
+struct xfs_scrub_bmap_info {
+ struct xfs_scrub_context *sc;
+ xfs_fileoff_t lastoff;
+ bool is_rt;
+ bool is_shared;
+ int whichfork;
+};
+
+/* Look for a corresponding rmap for this irec. */
+static inline bool
+xfs_scrub_bmap_get_rmap(
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_bmbt_irec *irec,
+ xfs_agblock_t agbno,
+ uint64_t owner,
+ struct xfs_rmap_irec *rmap)
+{
+ xfs_fileoff_t offset;
+ unsigned int rflags = 0;
+ int has_rmap;
+ int error;
+
+ if (info->whichfork == XFS_ATTR_FORK)
+ rflags |= XFS_RMAP_ATTR_FORK;
+
+ /*
+ * CoW staging extents are owned (on disk) by the refcountbt, so
+ * their rmaps do not have offsets.
+ */
+ if (info->whichfork == XFS_COW_FORK)
+ offset = 0;
+ else
+ offset = irec->br_startoff;
+
+ /*
+ * If the caller thinks this could be a shared bmbt extent (IOWs,
+ * any data fork extent of a reflink inode) then we have to use the
+ * range rmap lookup to make sure we get the correct owner/offset.
+ */
+ if (info->is_shared) {
+ error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
+ if (!xfs_scrub_should_check_xref(info->sc, &error,
+ &info->sc->sa.rmap_cur))
+ return false;
+ goto out;
+ }
+
+ /*
+ * Otherwise, use the (faster) regular lookup.
+ */
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
+ offset, rflags, &has_rmap);
+ if (!xfs_scrub_should_check_xref(info->sc, &error,
+ &info->sc->sa.rmap_cur))
+ return false;
+ if (!has_rmap)
+ goto out;
+
+ error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
+ if (!xfs_scrub_should_check_xref(info->sc, &error,
+ &info->sc->sa.rmap_cur))
+ return false;
+
+out:
+ if (!has_rmap)
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return has_rmap;
+}
+
+/* Make sure that we have rmapbt records for this extent. */
+STATIC void
+xfs_scrub_bmap_xref_rmap(
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_bmbt_irec *irec,
+ xfs_agblock_t agbno)
+{
+ struct xfs_rmap_irec rmap;
+ unsigned long long rmap_end;
+ uint64_t owner;
+
+ if (!info->sc->sa.rmap_cur)
+ return;
+
+ if (info->whichfork == XFS_COW_FORK)
+ owner = XFS_RMAP_OWN_COW;
+ else
+ owner = info->sc->ip->i_ino;
+
+ /* Find the rmap record for this irec. */
+ if (!xfs_scrub_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ return;
+
+ /* Check the rmap. */
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (rmap.rm_startblock > agbno ||
+ agbno + irec->br_blockcount > rmap_end)
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check the logical offsets if applicable. CoW staging extents
+ * don't track logical offsets since the mappings only exist in
+ * memory.
+ */
+ if (info->whichfork != XFS_COW_FORK) {
+ rmap_end = (unsigned long long)rmap.rm_offset +
+ rmap.rm_blockcount;
+ if (rmap.rm_offset > irec->br_startoff ||
+ irec->br_startoff + irec->br_blockcount > rmap_end)
+ xfs_scrub_fblock_xref_set_corrupt(info->sc,
+ info->whichfork, irec->br_startoff);
+ }
+
+ if (rmap.rm_owner != owner)
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check for discrepancies between the unwritten flag in the irec and
+ * the rmap. Note that the (in-memory) CoW fork distinguishes between
+ * unwritten and written extents, but we don't track that in the rmap
+ * records because the blocks are owned (on-disk) by the refcountbt,
+ * which doesn't track unwritten state.
+ */
+ if (owner != XFS_RMAP_OWN_COW &&
+ irec->br_state == XFS_EXT_UNWRITTEN &&
+ !(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (info->whichfork == XFS_ATTR_FORK &&
+ !(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
+ xfs_scrub_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+}
+
+/* Cross-reference a single rtdev extent record. */
+STATIC void
+xfs_scrub_bmap_rt_extent_xref(
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_inode *ip,
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec)
+{
+ if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_xref_is_used_rt_space(info->sc, irec->br_startblock,
+ irec->br_blockcount);
+}
+
+/* Cross-reference a single datadev extent record. */
+STATIC void
+xfs_scrub_bmap_extent_xref(
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_inode *ip,
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
+ int error;
+
+ if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
+ len = irec->br_blockcount;
+
+ error = xfs_scrub_ag_init(info->sc, agno, &info->sc->sa);
+ if (!xfs_scrub_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ return;
+
+ xfs_scrub_xref_is_used_space(info->sc, agbno, len);
+ xfs_scrub_xref_is_not_inode_chunk(info->sc, agbno, len);
+ xfs_scrub_bmap_xref_rmap(info, irec, agbno);
+ switch (info->whichfork) {
+ case XFS_DATA_FORK:
+ if (xfs_is_reflink_inode(info->sc->ip))
+ break;
+ /* fall through */
+ case XFS_ATTR_FORK:
+ xfs_scrub_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
+ break;
+ case XFS_COW_FORK:
+ xfs_scrub_xref_is_cow_staging(info->sc, agbno,
+ irec->br_blockcount);
+ break;
+ }
+
+ xfs_scrub_ag_free(info->sc, &info->sc->sa);
+}
+
+/* Scrub a single extent record. */
+STATIC int
+xfs_scrub_bmap_extent(
+ struct xfs_inode *ip,
+ struct xfs_btree_cur *cur,
+ struct xfs_scrub_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+ struct xfs_buf *bp = NULL;
+ xfs_filblks_t end;
+ int error = 0;
+
+ if (cur)
+ xfs_btree_get_block(cur, 0, &bp);
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* There should never be a "hole" extent in either extent list. */
+ if (irec->br_startblock == HOLESTARTBLOCK)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /*
+ * Check for delalloc extents. We never iterate the ones in the
+ * in-core extent scan, and we should never see these in the bmbt.
+ */
+ if (isnullstartblock(irec->br_startblock))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_blockcount > MAXEXTLEN)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ end = irec->br_startblock + irec->br_blockcount - 1;
+ if (info->is_rt &&
+ (!xfs_verify_rtbno(mp, irec->br_startblock) ||
+ !xfs_verify_rtbno(mp, end)))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (!info->is_rt &&
+ (!xfs_verify_fsbno(mp, irec->br_startblock) ||
+ !xfs_verify_fsbno(mp, end) ||
+ XFS_FSB_TO_AGNO(mp, irec->br_startblock) !=
+ XFS_FSB_TO_AGNO(mp, end)))
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* We don't allow unwritten extents on attr forks. */
+ if (irec->br_state == XFS_EXT_UNWRITTEN &&
+ info->whichfork == XFS_ATTR_FORK)
+ xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (info->is_rt)
+ xfs_scrub_bmap_rt_extent_xref(info, ip, cur, irec);
+ else
+ xfs_scrub_bmap_extent_xref(info, ip, cur, irec);
+
+ info->lastoff = irec->br_startoff + irec->br_blockcount;
+ return error;
+}
+
+/* Scrub a bmbt record. */
+STATIC int
+xfs_scrub_bmapbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_info *info = bs->private;
+ struct xfs_inode *ip = bs->cur->bc_private.b.ip;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block;
+ uint64_t owner;
+ int i;
+
+ /*
+ * Check the owners of the btree blocks up to the level below
+ * the root since the verifiers don't do that.
+ */
+ if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) &&
+ bs->cur->bc_ptrs[0] == 1) {
+ for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
+ block = xfs_btree_get_block(bs->cur, i, &bp);
+ owner = be64_to_cpu(block->bb_u.l.bb_owner);
+ if (owner != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(bs->sc,
+ info->whichfork, 0);
+ }
+ }
+
+ /* Set up the in-core record and scrub it. */
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ return xfs_scrub_bmap_extent(ip, bs->cur, info, &irec);
+}
+
+/* Scan the btree records. */
+STATIC int
+xfs_scrub_bmap_btree(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ struct xfs_scrub_bmap_info *info)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ error = xfs_scrub_btree(sc, cur, xfs_scrub_bmapbt_rec, &oinfo, info);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ return error;
+}
+
+struct xfs_scrub_bmap_check_rmap_info {
+ struct xfs_scrub_context *sc;
+ int whichfork;
+ struct xfs_iext_cursor icur;
+};
+
+/* Can we find bmaps that fit this rmap? */
+STATIC int
+xfs_scrub_bmap_check_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_check_rmap_info *sbcri = priv;
+ struct xfs_ifork *ifp;
+ struct xfs_scrub_context *sc = sbcri->sc;
+ bool have_map;
+
+ /* Is this even the right fork? */
+ if (rec->rm_owner != sc->ip->i_ino)
+ return 0;
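+	/*
+	 * Skip rmaps whose fork type (attr vs. data) doesn't match the
+	 * fork we were asked to check.
+	 */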
+ if ((sbcri->whichfork == XFS_ATTR_FORK) ^
+ !!(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return 0;
+
+ /* Now look up the bmbt record. */
+ ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+ if (!ifp) {
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ goto out;
+ }
+ have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset,
+ &sbcri->icur, &irec);
+ if (!have_map)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ /*
+	 * bmap extent records are constrained to 2^21 blocks in length
+ * because of space constraints in the on-disk metadata structure.
+ * However, rmap extent record lengths are constrained only by AG
+ * length, so we have to loop through the bmbt to make sure that the
+ * entire rmap is covered by bmbt records.
+ */
+ while (have_map) {
+ if (irec.br_startoff != rec->rm_offset)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
+ cur->bc_private.a.agno, rec->rm_startblock))
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (irec.br_blockcount > rec->rm_blockcount)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ rec->rm_startblock += irec.br_blockcount;
+ rec->rm_offset += irec.br_blockcount;
+ rec->rm_blockcount -= irec.br_blockcount;
+ if (rec->rm_blockcount == 0)
+ break;
+ have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec);
+ if (!have_map)
+ xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork,
+ rec->rm_offset);
+ }
+
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+ return 0;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xfs_scrub_bmap_check_ag_rmaps(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_agnumber_t agno)
+{
+ struct xfs_scrub_bmap_check_rmap_info sbcri;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno);
+ if (!cur) {
+ error = -ENOMEM;
+ goto out_agf;
+ }
+
+ sbcri.sc = sc;
+ sbcri.whichfork = whichfork;
+ error = xfs_rmap_query_all(cur, xfs_scrub_bmap_check_rmap, &sbcri);
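+	/*
+	 * The check function aborts the rmap query once it has flagged
+	 * corruption; that is not a real error.
+	 */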
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+out_agf:
+ xfs_trans_brelse(sc->tp, agf);
+ return error;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xfs_scrub_bmap_check_rmaps(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ loff_t size;
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) ||
+ whichfork == XFS_COW_FORK ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return 0;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
+ return 0;
+
+ /*
+ * Only do this for complex maps that are in btree format, or for
+ * situations where we would seem to have a size but zero extents.
+ * The inode repair code can zap broken iforks, which means we have
+ * to flag this bmap as corrupt if there are rmaps that need to be
+ * reattached.
+ */
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ size = i_size_read(VFS_I(sc->ip));
+ break;
+ case XFS_ATTR_FORK:
+ size = XFS_IFORK_Q(sc->ip);
+ break;
+ default:
+ size = 0;
+ break;
+ }
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0))
+ return 0;
+
+ for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) {
+ error = xfs_scrub_bmap_check_ag_rmaps(sc, whichfork, agno);
+ if (error)
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Scrub an inode fork's block mappings.
+ *
+ * First we scan every record in every btree block, if applicable.
+ * Then we unconditionally scan the incore extent cache.
+ */
+STATIC int
+xfs_scrub_bmap(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_info info = { NULL };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t endoff;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.whichfork = whichfork;
+ info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
+ info.sc = sc;
+
+ switch (whichfork) {
+ case XFS_COW_FORK:
+ /* Non-existent CoW forks are ignorable. */
+ if (!ifp)
+ goto out;
+ /* No CoW forks on non-reflink inodes/filesystems. */
+ if (!xfs_is_reflink_inode(ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ goto out;
+ }
+ break;
+ case XFS_ATTR_FORK:
+ if (!ifp)
+ goto out_check_rmap;
+ if (!xfs_sb_version_hasattr(&mp->m_sb) &&
+ !xfs_sb_version_hasattr2(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ break;
+ default:
+ ASSERT(whichfork == XFS_DATA_FORK);
+ break;
+ }
+
+ /* Check the fork values */
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ /* No mappings to check. */
+ goto out;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (whichfork == XFS_COW_FORK) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ error = xfs_scrub_bmap_btree(sc, whichfork, &info);
+ if (error)
+ goto out;
+ break;
+ default:
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Now try to scrub the in-memory extent list. */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
+
+ /* Find the offset of the last extent in the mapping. */
+ error = xfs_bmap_last_offset(ip, &endoff, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+
+ /* Scrub extent records. */
+ info.lastoff = 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+ if (isnullstartblock(irec.br_startblock))
+ continue;
+ if (irec.br_startoff >= endoff) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork,
+ irec.br_startoff);
+ goto out;
+ }
+ error = xfs_scrub_bmap_extent(ip, NULL, &info, &irec);
+ if (error)
+ goto out;
+ }
+
+out_check_rmap:
+ error = xfs_scrub_bmap_check_rmaps(sc, whichfork);
+ if (!xfs_scrub_fblock_xref_process_error(sc, whichfork, 0, &error))
+ goto out;
+out:
+ return error;
+}
+
+/* Scrub an inode's data fork. */
+int
+xfs_scrub_bmap_data(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_bmap(sc, XFS_DATA_FORK);
+}
+
+/* Scrub an inode's attr fork. */
+int
+xfs_scrub_bmap_attr(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_bmap(sc, XFS_ATTR_FORK);
+}
+
+/* Scrub an inode's CoW fork. */
+int
+xfs_scrub_bmap_cow(
+ struct xfs_scrub_context *sc)
+{
+ if (!xfs_is_reflink_inode(sc->ip))
+ return -ENOENT;
+
+ return xfs_scrub_bmap(sc, XFS_COW_FORK);
+}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
new file mode 100644
index 000000000000..54218168c8f9
--- /dev/null
+++ b/fs/xfs/scrub/btree.c
@@ -0,0 +1,676 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* btree scrubbing */
+
+/*
+ * Check for btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+static bool
+__xfs_scrub_btree_process_error(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ /* fall through */
+ default:
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xfs_scrub_ifork_btree_op_error(sc, cur, level,
+ *error, ret_ip);
+ else
+ trace_xfs_scrub_btree_op_error(sc, cur, level,
+ *error, ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xfs_scrub_btree_process_error(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error)
+{
+ return __xfs_scrub_btree_process_error(sc, cur, level, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xfs_scrub_btree_xref_process_error(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ int *error)
+{
+ return __xfs_scrub_btree_process_error(sc, cur, level, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/* Record btree block corruption. */
+static void
+__xfs_scrub_btree_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level,
+ __u32 errflag,
+ void *ret_ip)
+{
+ sc->sm->sm_flags |= errflag;
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ trace_xfs_scrub_ifork_btree_error(sc, cur, level,
+ ret_ip);
+ else
+ trace_xfs_scrub_btree_error(sc, cur, level,
+ ret_ip);
+}
+
+void
+xfs_scrub_btree_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
+ __return_address);
+}
+
+void
+xfs_scrub_btree_xref_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ __xfs_scrub_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
+ __return_address);
+}
+
+/*
+ * Make sure this record is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xfs_scrub_btree_rec(
+ struct xfs_scrub_btree *bs)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key key;
+ union xfs_btree_key hkey;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+ rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+ trace_xfs_scrub_btree_rec(bs->sc, cur, 0);
+
+ /* If this isn't the first record, are they in order? */
+ if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 0);
+ bs->firstrec = false;
+ memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+
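+	/* If there's only one level, there are no parent keys to check. */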
+ if (cur->bc_nlevels == 1)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ keyblock = xfs_btree_get_block(cur, 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Make sure this key is in order and doesn't stray outside of the parent
+ * keys.
+ */
+STATIC void
+xfs_scrub_btree_key(
+ struct xfs_scrub_btree *bs,
+ int level)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *key;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+
+ trace_xfs_scrub_btree_key(bs->sc, cur, level);
+
+ /* If this isn't the first key, are they in order? */
+ if (!bs->firstkey[level] &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ bs->firstkey[level] = false;
+ memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+
+ if (level + 1 >= cur->bc_nlevels)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ keyblock = xfs_btree_get_block(cur, level + 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+}
+
+/*
+ * Check a btree pointer. Returns true if it's ok to use this pointer.
+ * Callers do not need to set the corrupt flag.
+ */
+static bool
+xfs_scrub_btree_ptr_ok(
+ struct xfs_scrub_btree *bs,
+ int level,
+ union xfs_btree_ptr *ptr)
+{
+ bool res;
+
+ /* A btree rooted in an inode has no block pointer to the root. */
+ if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == bs->cur->bc_nlevels)
+ return true;
+
+ /* Otherwise, check the pointers. */
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
+ else
+ res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
+ if (!res)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+
+ return res;
+}
+
+/* Check that a btree block's sibling matches what we expect. */
+STATIC int
+xfs_scrub_btree_block_check_sibling(
+ struct xfs_scrub_btree *bs,
+ int level,
+ int direction,
+ union xfs_btree_ptr *sibling)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *pblock;
+ struct xfs_buf *pbp;
+ struct xfs_btree_cur *ncur = NULL;
+ union xfs_btree_ptr *pp;
+ int success;
+ int error;
+
+ error = xfs_btree_dup_cursor(cur, &ncur);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error) ||
+ !ncur)
+ return error;
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (xfs_btree_ptr_is_null(cur, sibling)) {
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (error == 0 && success)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Increment upper level pointer. */
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error))
+ goto out;
+ if (!success) {
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level + 1);
+ goto out;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp))
+ goto out;
+ if (pbp)
+ xfs_scrub_buffer_recheck(bs->sc, pbp);
+
+ if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+out:
+ xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/* Check the siblings of a btree block. */
+STATIC int
+xfs_scrub_btree_block_check_siblings(
+ struct xfs_scrub_btree *bs,
+ struct xfs_btree_block *block)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_ptr leftsib;
+ union xfs_btree_ptr rightsib;
+ int level;
+ int error = 0;
+
+ xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
+ xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
+ level = xfs_btree_get_level(block);
+
+ /* Root block should never have siblings. */
+ if (level == cur->bc_nlevels - 1) {
+ if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
+ !xfs_btree_ptr_is_null(cur, &rightsib))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ goto out;
+ }
+
+ /*
+	 * Do the left & right sibling pointers match the adjacent
+	 * parent level pointers?
+	 * (These functions absorb error codes for us.)
+ */
+ error = xfs_scrub_btree_block_check_sibling(bs, level, -1, &leftsib);
+ if (error)
+ return error;
+ error = xfs_scrub_btree_block_check_sibling(bs, level, 1, &rightsib);
+ if (error)
+ return error;
+out:
+ return error;
+}
+
+struct check_owner {
+ struct list_head list;
+ xfs_daddr_t daddr;
+ int level;
+};
+
+/*
+ * Make sure this btree block isn't in the free list and that there's
+ * an rmap record for it.
+ */
+STATIC int
+xfs_scrub_btree_check_block_owner(
+ struct xfs_scrub_btree *bs,
+ int level,
+ xfs_daddr_t daddr)
+{
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_btnum_t btnum;
+ bool init_sa;
+ int error = 0;
+
+ if (!bs->cur)
+ return 0;
+
+ btnum = bs->cur->bc_btnum;
+ agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
+ agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
+
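+	/*
+	 * Inode-rooted (long pointer) btrees span the whole filesystem,
+	 * so the caller won't already have AG headers loaded for this
+	 * block's AG; set up (and later tear down) the AG context here.
+	 */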
+ init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ if (init_sa) {
+ error = xfs_scrub_ag_init(bs->sc, agno, &bs->sc->sa);
+ if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur,
+ level, &error))
+ return error;
+ }
+
+ xfs_scrub_xref_is_used_space(bs->sc, agbno, 1);
+ /*
+ * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
+ * have to nullify it (to shut down further block owner checks) if
+ * self-xref encounters problems.
+ */
+ if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ bs->cur = NULL;
+
+ xfs_scrub_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
+ if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ bs->cur = NULL;
+
+ if (init_sa)
+ xfs_scrub_ag_free(bs->sc, &bs->sc->sa);
+
+ return error;
+}
+
+/* Check the owner of a btree block. */
+STATIC int
+xfs_scrub_btree_check_owner(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct check_owner *co;
+
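+	/*
+	 * The root of an inode-rooted btree lives in the inode fork
+	 * itself, so there's no separate block owner to check.
+	 */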
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ return 0;
+
+ /*
+ * We want to cross-reference each btree block with the bnobt
+ * and the rmapbt. We cannot cross-reference the bnobt or
+ * rmapbt while scanning the bnobt or rmapbt, respectively,
+ * because we cannot alter the cursor and we'd prefer not to
+ * duplicate cursors. Therefore, save the buffer daddr for
+ * later scanning.
+ */
+ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ co = kmem_alloc(sizeof(struct check_owner),
+ KM_MAYFAIL | KM_NOFS);
+ if (!co)
+ return -ENOMEM;
+ co->level = level;
+ co->daddr = XFS_BUF_ADDR(bp);
+ list_add_tail(&co->list, &bs->to_check);
+ return 0;
+ }
+
+ return xfs_scrub_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp));
+}
+
+/*
+ * Grab and scrub a btree block given a btree pointer. Returns block
+ * and buffer pointers (if applicable) if they're ok to use.
+ */
+STATIC int
+xfs_scrub_btree_get_block(
+ struct xfs_scrub_btree *bs,
+ int level,
+ union xfs_btree_ptr *pp,
+ struct xfs_btree_block **pblock,
+ struct xfs_buf **pbp)
+{
+ void *failed_at;
+ int error;
+
+ *pblock = NULL;
+ *pbp = NULL;
+
+ error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, level, &error) ||
+ !*pblock)
+ return error;
+
+ xfs_btree_get_block(bs->cur, level, pbp);
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
+ level, *pbp);
+ else
+ failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
+ level, *pbp);
+ if (failed_at) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+ return 0;
+ }
+ if (*pbp)
+ xfs_scrub_buffer_recheck(bs->sc, *pbp);
+
+ /*
+ * Check the block's owner; this function absorbs error codes
+ * for us.
+ */
+ error = xfs_scrub_btree_check_owner(bs, level, *pbp);
+ if (error)
+ return error;
+
+ /*
+ * Check the block's siblings; this function absorbs error codes
+ * for us.
+ */
+ return xfs_scrub_btree_block_check_siblings(bs, *pblock);
+}
+
+/*
+ * Check that the low and high keys of this block match the keys stored
+ * in the parent block.
+ */
+STATIC void
+xfs_scrub_btree_block_keys(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ union xfs_btree_key block_keys;
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *high_bk;
+ union xfs_btree_key *parent_keys;
+ union xfs_btree_key *high_pk;
+ struct xfs_btree_block *parent_block;
+ struct xfs_buf *bp;
+
+ if (level >= cur->bc_nlevels - 1)
+ return;
+
+ /* Calculate the keys for this block. */
+ xfs_btree_get_keys(cur, block, &block_keys);
+
+ /* Obtain the parent's copy of the keys for this block. */
+ parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Get high keys */
+ high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Visit all nodes and leaves of a btree. Check that all pointers and
+ * records are in order, that the keys reflect the records, and use a callback
+ * so that the caller can verify individual records.
+ */
+int
+xfs_scrub_btree(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo,
+ void *private)
+{
+ struct xfs_scrub_btree bs = { NULL };
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int level;
+ struct xfs_buf *bp;
+ struct check_owner *co;
+ struct check_owner *n;
+ int i;
+ int error = 0;
+
+ /* Initialize scrub state */
+ bs.cur = cur;
+ bs.scrub_rec = scrub_fn;
+ bs.oinfo = oinfo;
+ bs.firstrec = true;
+ bs.private = private;
+ bs.sc = sc;
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
+ bs.firstkey[i] = true;
+ INIT_LIST_HEAD(&bs.to_check);
+
+ /* Don't try to check a tree with a height we can't handle. */
+ if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ xfs_scrub_btree_set_corrupt(sc, cur, 0);
+ goto out;
+ }
+
+ /*
+ * Load the root of the btree. The helper function absorbs
+ * error codes for us.
+ */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ if (!xfs_scrub_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ goto out;
+ error = xfs_scrub_btree_get_block(&bs, level, &ptr, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_ptrs[level] = 1;
+
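+	/*
+	 * Walk the tree iteratively: visit each record at level 0 and
+	 * each key/pointer at the higher levels, using bc_ptrs[] to
+	 * remember our position within each level.
+	 */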
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ if (level == 0) {
+ /* End of leaf, pop back towards the root. */
+ if (cur->bc_ptrs[level] >
+ be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Records in order for scrub? */
+ xfs_scrub_btree_rec(&bs);
+
+ /* Call out to the record checker. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ error = bs.scrub_rec(&bs, recp);
+ if (error)
+ break;
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Keys in order for scrub? */
+ xfs_scrub_btree_key(&bs, level);
+
+ /* Drill another level deeper. */
+ pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ if (!xfs_scrub_btree_ptr_ok(&bs, level, pp)) {
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+ level--;
+ error = xfs_scrub_btree_get_block(&bs, level, pp, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ cur->bc_ptrs[level] = 1;
+ }
+
+out:
+ /* Process deferred owner checks on btree blocks. */
+ list_for_each_entry_safe(co, n, &bs.to_check, list) {
+ if (!error && bs.cur)
+ error = xfs_scrub_btree_check_block_owner(&bs,
+ co->level, co->daddr);
+ list_del(&co->list);
+ kmem_free(co);
+ }
+
+ return error;
+}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
new file mode 100644
index 000000000000..e2b868ede70b
--- /dev/null
+++ b/fs/xfs/scrub/btree.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_BTREE_H__
+#define __XFS_SCRUB_BTREE_H__
+
+/* btree scrub */
+
+/* Check for btree operation errors. */
+bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level, int *error);
+
+/* Check for btree xref operation errors. */
+bool xfs_scrub_btree_xref_process_error(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level,
+ int *error);
+
+/* Check for btree corruption. */
+void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level);
+
+/* Check for btree xref discrepancies. */
+void xfs_scrub_btree_xref_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level);
+
+struct xfs_scrub_btree;
+typedef int (*xfs_scrub_btree_rec_fn)(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec);
+
+struct xfs_scrub_btree {
+ /* caller-provided scrub state */
+ struct xfs_scrub_context *sc;
+ struct xfs_btree_cur *cur;
+ xfs_scrub_btree_rec_fn scrub_rec;
+ struct xfs_owner_info *oinfo;
+ void *private;
+
+ /* internal scrub state */
+ union xfs_btree_rec lastrec;
+ bool firstrec;
+ union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
+ bool firstkey[XFS_BTREE_MAXLEVELS];
+ struct list_head to_check;
+};
+int xfs_scrub_btree(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo, void *private);
+
+#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
new file mode 100644
index 000000000000..8ed91d5c868d
--- /dev/null
+++ b/fs/xfs/scrub/common.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/btree.h"
+
+/* Common code for the metadata scrubbers. */
+
+/*
+ * Handling operational errors.
+ *
+ * The *_process_error() family of functions are used to process error return
+ * codes from functions called as part of a scrub operation.
+ *
+ * If there's no error, we return true to tell the caller that it's ok
+ * to move on to the next check in its list.
+ *
+ * For non-verifier errors (e.g. ENOMEM) we return false to tell the
+ * caller that something bad happened, and we preserve *error so that
+ * the caller can return the *error up the stack to userspace.
+ *
+ * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
+ * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
+ * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
+ * not via return codes. We return false to tell the caller that
+ * something bad happened. Since the error has been cleared, the caller
+ * will (presumably) return that zero and scrubbing will move on to
+ * whatever's next.
+ *
+ * ftrace can be used to record the precise metadata location and the
+ * approximate code location of the failed operation.
+ */
+
+/* Check for operational errors. */
+static bool
+__xfs_scrub_process_error(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_op_error(sc, agno, bno, *error,
+ ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xfs_scrub_process_error(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error)
+{
+ return __xfs_scrub_process_error(sc, agno, bno, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xfs_scrub_xref_process_error(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ int *error)
+{
+ return __xfs_scrub_process_error(sc, agno, bno, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/* Check for operational errors for a file offset. */
+static bool
+__xfs_scrub_fblock_process_error(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error,
+ __u32 errflag,
+ void *ret_ip)
+{
+ switch (*error) {
+ case 0:
+ return true;
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= errflag;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ break;
+ }
+ return false;
+}
+
+bool
+xfs_scrub_fblock_process_error(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error)
+{
+ return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
+xfs_scrub_fblock_xref_process_error(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset,
+ int *error)
+{
+ return __xfs_scrub_fblock_process_error(sc, whichfork, offset, error,
+ XFS_SCRUB_OFLAG_XFAIL, __return_address);
+}
+
+/*
+ * Handling scrub corruption/optimization/warning checks.
+ *
+ * The *_set_{corrupt,preen,warning}() family of functions are used to
+ * record the presence of metadata that is incorrect (corrupt), could be
+ * optimized somehow (preen), or should be flagged for administrative
+ * review but is not incorrect (warn).
+ *
+ * ftrace can be used to record the precise metadata location and
+ * approximate code location of the failed check.
+ */
+
+/* Record a block which could be optimized. */
+void
+xfs_scrub_block_set_preen(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xfs_scrub_block_preen(sc, bp->b_bn, __return_address);
+}
+
+/* Record an inode which could be optimized. */
+void
+xfs_scrub_ino_set_preen(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xfs_scrub_ino_preen(sc, ino, __return_address);
+}
+
+/* Record a corrupt block. */
+void
+xfs_scrub_block_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address);
+}
+
+/* Record a corruption while cross-referencing. */
+void
+xfs_scrub_block_xref_set_corrupt(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address);
+}
+
+/* Record a corrupt inode. */
+void
+xfs_scrub_ino_set_corrupt(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_ino_error(sc, ino, __return_address);
+}
+
+/* Record a corruption while cross-referencing with an inode. */
+void
+xfs_scrub_ino_xref_set_corrupt(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xfs_scrub_ino_error(sc, ino, __return_address);
+}
+
+/* Record corruption in a block indexed by a file fork. */
+void
+xfs_scrub_fblock_set_corrupt(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/* Record a corruption while cross-referencing a fork block. */
+void
+xfs_scrub_fblock_xref_set_corrupt(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
+ trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/*
+ * Warn about inodes that need administrative review but are not
+ * incorrect.
+ */
+void
+xfs_scrub_ino_set_warning(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xfs_scrub_ino_warning(sc, ino, __return_address);
+}
+
+/* Warn about a block indexed by a file fork that needs review. */
+void
+xfs_scrub_fblock_set_warning(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+ trace_xfs_scrub_fblock_warning(sc, whichfork, offset, __return_address);
+}
+
+/* Signal an incomplete scrub. */
+void
+xfs_scrub_set_incomplete(
+ struct xfs_scrub_context *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
+ trace_xfs_scrub_incomplete(sc, __return_address);
+}
+
+/*
+ * rmap scrubbing -- compute the number of blocks with a given owner,
+ * at least according to the reverse mapping data.
+ */
+
+struct xfs_scrub_rmap_ownedby_info {
+ struct xfs_owner_info *oinfo;
+ xfs_filblks_t *blocks;
+};
+
+STATIC int
+xfs_scrub_count_rmap_ownedby_irec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_scrub_rmap_ownedby_info *sroi = priv;
+ bool irec_attr;
+ bool oinfo_attr;
+
+ irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
+ oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
+
+ if (rec->rm_owner != sroi->oinfo->oi_owner)
+ return 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
+ (*sroi->blocks) += rec->rm_blockcount;
+
+ return 0;
+}
+
+/*
+ * Calculate the number of blocks the rmap thinks are owned by something.
+ * The caller should pass us an rmapbt cursor.
+ */
+int
+xfs_scrub_count_rmap_ownedby_ag(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ struct xfs_owner_info *oinfo,
+ xfs_filblks_t *blocks)
+{
+ struct xfs_scrub_rmap_ownedby_info sroi;
+
+ sroi.oinfo = oinfo;
+ *blocks = 0;
+ sroi.blocks = blocks;
+
+ return xfs_rmap_query_all(cur, xfs_scrub_count_rmap_ownedby_irec,
+ &sroi);
+}
+
+/*
+ * AG scrubbing
+ *
+ * These helpers facilitate locking an allocation group's header
+ * buffers, setting up cursors for all btrees that are present, and
+ * cleaning everything up once we're through.
+ */
+
+/* Decide if we want to return an AG header read failure. */
+static inline bool
+want_ag_read_header_failure(
+ struct xfs_scrub_context *sc,
+ unsigned int type)
+{
+ /* Return all AG header read failures when scanning btrees. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
+ return true;
+ /*
+ * If we're scanning a given type of AG header, we only want to
+ * see read failures from that specific header. We'd like the
+ * other headers to cross-check them, but this isn't required.
+ */
+ if (sc->sm->sm_type == type)
+ return true;
+ return false;
+}
+
+/*
+ * Grab all the headers for an AG.
+ *
+ * The headers should be released by xfs_scrub_ag_free, but as a
+ * failsafe we attach all the buffers we grab to the scrub transaction so
+ * they'll all be freed when we cancel it.
+ */
+int
+xfs_scrub_ag_read_headers(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ struct xfs_buf **agi,
+ struct xfs_buf **agf,
+ struct xfs_buf **agfl)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+ goto out;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+ goto out;
+
+ error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+ goto out;
+ error = 0;
+out:
+ return error;
+}
+
+/* Release all the AG btree cursors. */
+void
+xfs_scrub_ag_btcur_free(
+ struct xfs_scrub_ag *sa)
+{
+ if (sa->refc_cur)
+ xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
+ if (sa->rmap_cur)
+ xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
+ if (sa->fino_cur)
+ xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
+ if (sa->ino_cur)
+ xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
+ if (sa->cnt_cur)
+ xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
+ if (sa->bno_cur)
+ xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
+
+ sa->refc_cur = NULL;
+ sa->rmap_cur = NULL;
+ sa->fino_cur = NULL;
+ sa->ino_cur = NULL;
+ sa->bno_cur = NULL;
+ sa->cnt_cur = NULL;
+}
+
+/* Initialize all the btree cursors for an AG. */
+int
+xfs_scrub_ag_btcur_init(
+ struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sa->agno;
+
+ if (sa->agf_bp) {
+ /* Set up a bnobt cursor for cross-referencing. */
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno, XFS_BTNUM_BNO);
+ if (!sa->bno_cur)
+ goto err;
+
+ /* Set up a cntbt cursor for cross-referencing. */
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno, XFS_BTNUM_CNT);
+ if (!sa->cnt_cur)
+ goto err;
+ }
+
+	/* Set up an inobt cursor for cross-referencing. */
+ if (sa->agi_bp) {
+ sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ agno, XFS_BTNUM_INO);
+ if (!sa->ino_cur)
+ goto err;
+ }
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ agno, XFS_BTNUM_FINO);
+ if (!sa->fino_cur)
+ goto err;
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ agno);
+ if (!sa->rmap_cur)
+ goto err;
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, agno, NULL);
+ if (!sa->refc_cur)
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENOMEM;
+}
+
+/* Release the AG header context and btree cursors. */
+void
+xfs_scrub_ag_free(
+ struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa)
+{
+ xfs_scrub_ag_btcur_free(sa);
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+ sa->agno = NULLAGNUMBER;
+}
+
+/*
+ * For scrub, grab the AGI and the AGF headers, in that order. Locking
+ * order requires us to get the AGI before the AGF. We use the
+ * transaction to avoid deadlocking on crosslinked metadata buffers;
+ * either the caller passes one in (bmap scrub) or we have to create a
+ * transaction ourselves.
+ */
+int
+xfs_scrub_ag_init(
+ struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno,
+ struct xfs_scrub_ag *sa)
+{
+ int error;
+
+ sa->agno = agno;
+ error = xfs_scrub_ag_read_headers(sc, agno, &sa->agi_bp,
+ &sa->agf_bp, &sa->agfl_bp);
+ if (error)
+ return error;
+
+ return xfs_scrub_ag_btcur_init(sc, sa);
+}
+
+/* Per-scrubber setup functions */
+
+/* Set us up with a transaction and an empty context. */
+int
+xfs_scrub_setup_fs(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+}
+
+/* Set us up with AG headers and btree cursors. */
+int
+xfs_scrub_setup_ag_btree(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip,
+ bool force_log)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /*
+	 * If the caller asks us to checkpoint the log, do so. This
+ * expensive operation should be performed infrequently and only
+ * as a last resort. Any caller that sets force_log should
+ * document why they need to do so.
+ */
+ if (force_log) {
+ error = xfs_scrub_checkpoint_log(mp);
+ if (error)
+ return error;
+ }
+
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+
+ return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
+}
+
+/* Push everything out of the log onto disk. */
+int
+xfs_scrub_checkpoint_log(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+ xfs_ail_push_all_sync(mp->m_ail);
+ return 0;
+}
+
+/*
+ * Given an inode and the scrub control structure, grab either the
+ * inode referenced in the control structure or the inode passed in.
+ * The inode is not locked.
+ */
+int
+xfs_scrub_get_inode(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip_in)
+{
+ struct xfs_imap imap;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = NULL;
+ int error;
+
+ /* We want to scan the inode we already had opened. */
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+ sc->ip = ip_in;
+ return 0;
+ }
+
+ /* Look up the inode, see if the generation number matches. */
+ if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ return -ENOENT;
+ error = xfs_iget(mp, NULL, sc->sm->sm_ino,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
+ switch (error) {
+ case -ENOENT:
+ /* Inode doesn't exist, just bail out. */
+ return error;
+ case 0:
+ /* Got an inode, continue. */
+ break;
+ case -EINVAL:
+ /*
+ * -EINVAL with IGET_UNTRUSTED could mean one of several
+ * things: userspace gave us an inode number that doesn't
+ * correspond to fs space, or doesn't have an inobt entry;
+ * or it could simply mean that the inode buffer failed the
+ * read verifiers.
+ *
+ * Try just the inode mapping lookup -- if it succeeds, then
+ * the inode buffer verifier failed and something needs fixing.
+ * Otherwise, we really couldn't find it so tell userspace
+ * that it no longer exists.
+ */
+ error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
+ if (error)
+ return -ENOENT;
+ error = -EFSCORRUPTED;
+ /* fall through */
+ default:
+ trace_xfs_scrub_op_error(sc,
+ XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
+ XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
+ return error;
+ }
+ if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+ iput(VFS_I(ip));
+ return -ENOENT;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/* Set us up to scrub a file's contents. */
+int
+xfs_scrub_setup_inode_contents(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip,
+ unsigned int resblks)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_scrub_get_inode(sc, ip);
+ if (error)
+ return error;
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
+
+/*
+ * Predicate that decides if we need to evaluate the cross-reference check.
+ * If there was an error accessing the cross-reference btree, just delete
+ * the cursor and skip the check.
+ */
+bool
+xfs_scrub_should_check_xref(
+ struct xfs_scrub_context *sc,
+ int *error,
+ struct xfs_btree_cur **curpp)
+{
+ if (*error == 0)
+ return true;
+
+ if (curpp) {
+ /* If we've already given up on xref, just bail out. */
+ if (!*curpp)
+ return false;
+
+ /* xref error, delete cursor and bail out. */
+ xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
+ *curpp = NULL;
+ }
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+ trace_xfs_scrub_xref_error(sc, *error, __return_address);
+
+ /*
+ * Errors encountered during cross-referencing with another
+ * data structure should not cause this scrubber to abort.
+ */
+ *error = 0;
+ return false;
+}
+
+/* Run the structure verifiers on in-memory buffers to detect bad memory. */
+void
+xfs_scrub_buffer_recheck(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (bp->b_ops == NULL) {
+ xfs_scrub_block_set_corrupt(sc, bp);
+ return;
+ }
+ if (bp->b_ops->verify_struct == NULL) {
+ xfs_scrub_set_incomplete(sc);
+ return;
+ }
+ fa = bp->b_ops->verify_struct(bp);
+ if (!fa)
+ return;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xfs_scrub_block_error(sc, bp->b_bn, fa);
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
new file mode 100644
index 000000000000..deaf60400981
--- /dev/null
+++ b/fs/xfs/scrub/common.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_COMMON_H__
+#define __XFS_SCRUB_COMMON_H__
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xfs_scrub_should_terminate(
+ struct xfs_scrub_context *sc,
+ int *error)
+{
+ if (fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EAGAIN;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ */
+static inline int
+xfs_scrub_trans_alloc(
+ struct xfs_scrub_metadata *sm,
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ return xfs_trans_alloc_empty(mp, tpp);
+}
+
+bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int *error);
+bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int *error);
+
+bool xfs_scrub_xref_process_error(struct xfs_scrub_context *sc,
+ xfs_agnumber_t agno, xfs_agblock_t bno, int *error);
+bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc,
+ int whichfork, xfs_fileoff_t offset, int *error);
+
+void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino);
+
+void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino);
+void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc,
+ xfs_ino_t ino);
+void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc,
+ int whichfork, xfs_fileoff_t offset);
+
+void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino);
+void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc);
+int xfs_scrub_checkpoint_log(struct xfs_mount *mp);
+
+/* Are we set up for a cross-referencing check? */
+bool xfs_scrub_should_check_xref(struct xfs_scrub_context *sc, int *error,
+ struct xfs_btree_cur **curpp);
+
+/* Setup functions */
+int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap_data(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_directory(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_xattr(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_symlink(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_parent(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+
+void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_buf **agi, struct xfs_buf **agf,
+ struct xfs_buf **agfl);
+void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa);
+int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
+ int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
+ void *),
+ void *priv);
+int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ struct xfs_owner_info *oinfo,
+ xfs_filblks_t *blocks);
+
+int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, bool force_log);
+int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in);
+int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, unsigned int resblks);
+void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp);
+
+#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
new file mode 100644
index 000000000000..bffdb7dc09bf
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Directory/Attribute Btree */
+
+/*
+ * Check for da btree operation errors. See the section about handling
+ * operational errors in common.c.
+ */
+bool
+xfs_scrub_da_process_error(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int *error)
+{
+ struct xfs_scrub_context *sc = ds->sc;
+
+ if (*error == 0)
+ return true;
+
+ switch (*error) {
+ case -EDEADLOCK:
+ /* Used to restart an op with deadlock avoidance. */
+ trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+ break;
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Note the badness but don't abort. */
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ *error = 0;
+ /* fall through */
+ default:
+ trace_xfs_scrub_file_op_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ *error, __return_address);
+ break;
+ }
+ return false;
+}
+
+/*
+ * Check for da btree corruption. See the section about handling
+ * operational errors in common.c.
+ */
+void
+xfs_scrub_da_set_corrupt(
+ struct xfs_scrub_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub_context *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ trace_xfs_scrub_fblock_error(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+/* Find an entry at a certain level in a da btree. */
+STATIC void *
+xfs_scrub_da_btree_entry(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int rec)
+{
+ char *ents;
+ struct xfs_da_state_blk *blk;
+ void *baddr;
+
+ /* Dispatch the entry finding function. */
+ blk = &ds->state->path.blk[level];
+ baddr = blk->bp->b_addr;
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ ents = (char *)xfs_attr3_leaf_entryp(baddr);
+ return ents + (rec * sizeof(struct xfs_attr_leaf_entry));
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
+ return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
+ return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr);
+ return ents + (rec * sizeof(struct xfs_da_node_entry));
+ }
+
+ return NULL;
+}
+
+/* Scrub a da btree hash (key). */
+int
+xfs_scrub_da_btree_hash(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ __be32 *hashp)
+{
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *entry;
+ xfs_dahash_t hash;
+ xfs_dahash_t parent_hash;
+
+ /* Is this hash in order? */
+ hash = be32_to_cpu(*hashp);
+ if (hash < ds->hashes[level])
+ xfs_scrub_da_set_corrupt(ds, level);
+ ds->hashes[level] = hash;
+
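+	/* The root block has no parent entry to compare against. */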
+ if (level == 0)
+ return 0;
+
+ /* Is this hash no larger than the parent hash? */
+ blks = ds->state->path.blk;
+ entry = xfs_scrub_da_btree_entry(ds, level - 1, blks[level - 1].index);
+ parent_hash = be32_to_cpu(entry->hashval);
+ if (parent_hash < hash)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ return 0;
+}
+
+/*
+ * Check a da btree pointer. Returns true if it's ok to use this
+ * pointer.
+ */
+STATIC bool
+xfs_scrub_da_btree_ptr_ok(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
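+	/* A zero ds->highest means there is no upper bound (the attr fork). */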
+ if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The da btree scrubber can handle leaf1 blocks as a degenerate
+ * form of leafn blocks. Since the regular da code doesn't handle
+ * leaf1, we must multiplex the verifiers.
+ */
+static void
+xfs_scrub_da_btree_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ /*
+		 * xfs_da3_node_buf_ops already knows how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ }
+}
+static void
+xfs_scrub_da_btree_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ default:
+ /*
+		 * xfs_da3_node_buf_ops already knows how to handle
+ * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+ */
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ bp->b_ops->verify_write(bp);
+ return;
+ }
+}
+static void *
+xfs_scrub_da_btree_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ default:
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ return bp->b_ops->verify_struct(bp);
+ }
+}
+
+static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = {
+ .name = "xfs_scrub_da_btree",
+ .verify_read = xfs_scrub_da_btree_read_verify,
+ .verify_write = xfs_scrub_da_btree_write_verify,
+ .verify_struct = xfs_scrub_da_btree_verify,
+};
+
+/* Check a block's sibling. */
+STATIC int
+xfs_scrub_da_btree_block_check_sibling(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int direction,
+ xfs_dablk_t sibling)
+{
+ int retval;
+ int error;
+
+ memcpy(&ds->state->altpath, &ds->state->path,
+ sizeof(ds->state->altpath));
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (sibling == 0) {
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ if (error == 0 && retval == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ error = 0;
+ goto out;
+ }
+
+ /* Move the alternate cursor one block in the direction given. */
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ return error;
+ if (retval) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return error;
+ }
+ if (ds->state->altpath.blk[level].bp)
+ xfs_scrub_buffer_recheck(ds->sc,
+ ds->state->altpath.blk[level].bp);
+
+ /* Compare upper level pointer to sibling pointer. */
+ if (ds->state->altpath.blk[level].blkno != sibling)
+ xfs_scrub_da_set_corrupt(ds, level);
+ xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+out:
+ return error;
+}
+
+/* Check a block's sibling pointers. */
+STATIC int
+xfs_scrub_da_btree_block_check_siblings(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ struct xfs_da_blkinfo *hdr)
+{
+ xfs_dablk_t forw;
+ xfs_dablk_t back;
+ int error = 0;
+
+ forw = be32_to_cpu(hdr->forw);
+ back = be32_to_cpu(hdr->back);
+
+ /* Top level blocks should not have sibling pointers. */
+ if (level == 0) {
+ if (forw != 0 || back != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ return 0;
+ }
+
+ /*
+ * Check back (left) and forw (right) pointers. These functions
+ * absorb error codes for us.
+ */
+ error = xfs_scrub_da_btree_block_check_sibling(ds, level, 0, back);
+ if (error)
+ goto out;
+ error = xfs_scrub_da_btree_block_check_sibling(ds, level, 1, forw);
+
+out:
+ memset(&ds->state->altpath, 0, sizeof(ds->state->altpath));
+ return error;
+}
+
+/* Load a dir/attribute block from a btree. */
+STATIC int
+xfs_scrub_da_btree_block(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_blkinfo *hdr3;
+ struct xfs_da_args *dargs = &ds->dargs;
+ struct xfs_inode *ip = ds->dargs.dp;
+ xfs_ino_t owner;
+ int *pmaxrecs;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int error = 0;
+
+ blk = &ds->state->path.blk[level];
+ ds->state->path.active = level + 1;
+
+ /* Release old block. */
+ if (blk->bp) {
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+ }
+
+ /* Check the pointer. */
+ blk->blkno = blkno;
+ if (!xfs_scrub_da_btree_ptr_ok(ds, level, blkno))
+ goto out_nobuf;
+
+	/*
+	 * Read the buffer.  A mappedbno of -2 means that a hole is not an
+	 * error; the read simply returns a null buffer, which we check for
+	 * below.
+	 */
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2,
+ &blk->bp, dargs->whichfork,
+ &xfs_scrub_da_btree_buf_ops);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ goto out_nobuf;
+ if (blk->bp)
+ xfs_scrub_buffer_recheck(ds->sc, blk->bp);
+
+ /*
+ * We didn't find a dir btree root block, which means that
+ * there's no LEAF1/LEAFN tree (at least not where it's supposed
+ * to be), so jump out now.
+ */
+ if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 &&
+ blk->bp == NULL)
+ goto out_nobuf;
+
+ /* It's /not/ ok for attr trees not to have a da btree. */
+ if (blk->bp == NULL) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_nobuf;
+ }
+
+ hdr3 = blk->bp->b_addr;
+ blk->magic = be16_to_cpu(hdr3->hdr.magic);
+ pmaxrecs = &ds->maxrecs[level];
+
+ /* We only started zeroing the header on v5 filesystems. */
+ if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ /* Check the owner. */
+ if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) {
+ owner = be64_to_cpu(hdr3->owner);
+ if (owner != ip->i_ino)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+
+ /* Check the siblings. */
+ error = xfs_scrub_da_btree_block_check_siblings(ds, level, &hdr3->hdr);
+ if (error)
+ goto out;
+
+ /* Interpret the buffer. */
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_ATTR_LEAF_BUF);
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs);
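+		/* Leaf blocks may only appear at the bottom of the tree. */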
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAFN_BUF);
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAF1_BUF);
+ blk->magic = XFS_DIR2_LEAF1_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DA_NODE_BUF);
+ blk->magic = XFS_DA_NODE_MAGIC;
+ node = blk->bp->b_addr;
+ ip->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = ip->d_ops->node_tree_p(node);
+ *pmaxrecs = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
+ if (level == 0) {
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ ds->tree_level = nodehdr.level;
+ } else {
+ if (ds->tree_level != nodehdr.level) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
+ /* XXX: Check hdr3.pad32 once we know how to fix it. */
+ break;
+ default:
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
+out:
+ return error;
+out_freebp:
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+out_nobuf:
+ blk->blkno = 0;
+ return error;
+}
+
+/* Visit all nodes and leaves of a da btree. */
+int
+xfs_scrub_da_btree(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn,
+ void *private)
+{
+ struct xfs_scrub_da_btree ds = {};
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *key;
+ void *rec;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+
+ /* Skip short format data structures; no btree to scan. */
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ /* Set up initial da state. */
+ ds.dargs.dp = sc->ip;
+ ds.dargs.whichfork = whichfork;
+ ds.dargs.trans = sc->tp;
+ ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds.state = xfs_da_state_alloc();
+ ds.state->args = &ds.dargs;
+ ds.state->mp = mp;
+ ds.sc = sc;
+ ds.private = private;
+ if (whichfork == XFS_ATTR_FORK) {
+ ds.dargs.geo = mp->m_attr_geo;
+ ds.lowest = 0;
+ ds.highest = 0;
+ } else {
+ ds.dargs.geo = mp->m_dir_geo;
+ ds.lowest = ds.dargs.geo->leafblk;
+ ds.highest = ds.dargs.geo->freeblk;
+ }
+ blkno = ds.lowest;
+ level = 0;
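+	/*
+	 * Note that "level" indexes the path from the root (0) downward,
+	 * while ds.tree_level tracks the on-disk node level, which
+	 * decreases to zero at the leaves.
+	 */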
+
+ /* Find the root of the da tree, if present. */
+ blks = ds.state->path.blk;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out_state;
+ /*
+ * We didn't find a block at ds.lowest, which means that there's
+ * no LEAF1/LEAFN tree (at least not where it's supposed to be),
+ * so jump out now.
+ */
+ if (blks[level].bp == NULL)
+ goto out_state;
+
+ blks[level].index = 0;
+ while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) {
+ /* Handle leaf block. */
+ if (blks[level].magic != XFS_DA_NODE_MAGIC) {
+ /* End of leaf, pop back towards the root. */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Dispatch record scrubbing. */
+ rec = xfs_scrub_da_btree_entry(&ds, level,
+ blks[level].index);
+ error = scrub_fn(&ds, level, rec);
+ if (error)
+ break;
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ blks[level].index++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Hashes in order for scrub? */
+ key = xfs_scrub_da_btree_entry(&ds, level, blks[level].index);
+ error = xfs_scrub_da_btree_hash(&ds, level, &key->hashval);
+ if (error)
+ goto out;
+
+ /* Drill another level deeper. */
+ blkno = be32_to_cpu(key->before);
+ level++;
+ ds.tree_level--;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out;
+ if (blks[level].bp == NULL)
+ goto out;
+
+ blks[level].index = 0;
+ }
+
+out:
+ /* Release all the buffers we're tracking. */
+ for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) {
+ if (blks[level].bp == NULL)
+ continue;
+ xfs_trans_brelse(sc->tp, blks[level].bp);
+ blks[level].bp = NULL;
+ }
+
+out_state:
+ xfs_da_state_free(ds.state);
+ return error;
+}
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
new file mode 100644
index 000000000000..d31468d68cef
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_DABTREE_H__
+#define __XFS_SCRUB_DABTREE_H__
+
+/* dir/attr btree */
+
+struct xfs_scrub_da_btree {
+ struct xfs_da_args dargs;
+ xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH];
+ int maxrecs[XFS_DA_NODE_MAXDEPTH];
+ struct xfs_da_state *state;
+ struct xfs_scrub_context *sc;
+ void *private;
+
+ /*
+ * Lowest and highest directory block address in which we expect
+ * to find dir/attr btree node blocks. For a directory this
+ * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for
+ * attributes there is no limit.
+ */
+ xfs_dablk_t lowest;
+ xfs_dablk_t highest;
+
+ int tree_level;
+};
+
+typedef int (*xfs_scrub_da_btree_rec_fn)(struct xfs_scrub_da_btree *ds,
+ int level, void *rec);
+
+/* Check for da btree operation errors. */
+bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *error);
+
+/* Check for da btree corruption. */
+void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level);
+
+int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level,
+ __be32 *hashp);
+int xfs_scrub_da_btree(struct xfs_scrub_context *sc, int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn, void *private);
+
+#endif /* __XFS_SCRUB_DABTREE_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
new file mode 100644
index 000000000000..38f29806eb54
--- /dev/null
+++ b/fs/xfs/scrub/dir.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Set us up to scrub directories. */
+int
+xfs_scrub_setup_directory(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Directories */
+
+/* Scrub a directory entry. */
+
+struct xfs_scrub_dir_ctx {
+ /* VFS fill-directory iterator */
+ struct dir_context dir_iter;
+
+ struct xfs_scrub_context *sc;
+};
+
+/* Check that an inode's mode matches a given DT_ type. */
+STATIC int
+xfs_scrub_dir_check_ftype(
+ struct xfs_scrub_dir_ctx *sdc,
+ xfs_fileoff_t offset,
+ xfs_ino_t inum,
+ int dtype)
+{
+ struct xfs_mount *mp = sdc->sc->mp;
+ struct xfs_inode *ip;
+ int ino_dtype;
+ int error = 0;
+
+ if (!xfs_sb_version_hasftype(&mp->m_sb)) {
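+		/*
+		 * Without the ftype feature, readdir can only report
+		 * DT_UNKNOWN, or DT_DIR for the "." and ".." entries.
+		 */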
+ if (dtype != DT_UNKNOWN && dtype != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ goto out;
+ }
+
+ /*
+ * Grab the inode pointed to by the dirent. We release the
+	 * inode before we cancel the scrub transaction.  Since we
+ * don't know a priori that releasing the inode won't trigger
+ * eofblocks cleanup (which allocates what would be a nested
+ * transaction), we can't use DONTCACHE here because DONTCACHE
+ * inodes can trigger immediate inactive cleanup of the inode.
+ */
+ error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
+ if (!xfs_scrub_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto out;
+
+ /* Convert mode to the DT_* values that dir_emit uses. */
+ ino_dtype = xfs_dir3_get_dtype(mp,
+ xfs_mode_to_ftype(VFS_I(ip)->i_mode));
+ if (ino_dtype != dtype)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ iput(VFS_I(ip));
+out:
+ return error;
+}
+
+/*
+ * Scrub a single directory entry.
+ *
+ * We use the VFS directory iterator (i.e. readdir) to call this
+ * function for every directory entry in a directory. Once we're here,
+ * we check the inode number to make sure it's sane, then we check that
+ * we can look up this filename. Finally, we check the ftype.
+ */
+STATIC int
+xfs_scrub_dir_actor(
+ struct dir_context *dir_iter,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xfs_mount *mp;
+ struct xfs_inode *ip;
+ struct xfs_scrub_dir_ctx *sdc;
+ struct xfs_name xname;
+ xfs_ino_t lookup_ino;
+ xfs_dablk_t offset;
+ int error = 0;
+
+ sdc = container_of(dir_iter, struct xfs_scrub_dir_ctx, dir_iter);
+ ip = sdc->sc->ip;
+ mp = ip->i_mount;
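+	/* Convert the readdir position (a dataptr) to a da block number. */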
+ offset = xfs_dir2_db_to_da(mp->m_dir_geo,
+ xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+
+ /* Does this inode number make sense? */
+ if (!xfs_verify_dir_ino(mp, ino)) {
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ if (!strncmp(".", name, namelen)) {
+ /* If this is "." then check that the inum matches the dir. */
+ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ if (ino != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ } else if (!strncmp("..", name, namelen)) {
+ /*
+ * If this is ".." in the root inode, check that the inum
+ * matches this dir.
+ */
+ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
+ offset);
+ }
+
+ /* Verify that we can look up this name by hash. */
+ xname.name = name;
+ xname.len = namelen;
+ xname.type = XFS_DIR3_FT_UNKNOWN;
+
+ error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
+ &error))
+ goto fail_xref;
+ if (lookup_ino != ino) {
+ xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
+ /* Verify the file type. This function absorbs error codes. */
+ error = xfs_scrub_dir_check_ftype(sdc, offset, lookup_ino, type);
+ if (error)
+ goto out;
+out:
+ return error;
+fail_xref:
+ return error;
+}
+
+/* Scrub a directory btree record. */
+STATIC int
+xfs_scrub_dir_rec(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ void *rec)
+{
+ struct xfs_mount *mp = ds->state->mp;
+ struct xfs_dir2_leaf_entry *ent = rec;
+ struct xfs_inode *dp = ds->dargs.dp;
+ struct xfs_dir2_data_entry *dent;
+ struct xfs_buf *bp;
+ char *p, *endp;
+ xfs_ino_t ino;
+ xfs_dablk_t rec_bno;
+ xfs_dir2_db_t db;
+ xfs_dir2_data_aoff_t off;
+ xfs_dir2_dataptr_t ptr;
+ xfs_dahash_t calc_hash;
+ xfs_dahash_t hash;
+ unsigned int tag;
+ int error;
+
+ /* Check the hash of the entry. */
+ error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+ if (error)
+ goto out;
+
+ /* Valid hash pointer? */
+ ptr = be32_to_cpu(ent->address);
+ if (ptr == 0)
+ return 0;
+
+ /* Find the directory entry's location. */
+ db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr);
+ off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr);
+ rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db);
+
+ if (rec_bno >= mp->m_dir_geo->leafblk) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out;
+ }
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp);
+ if (!xfs_scrub_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
+ &error))
+ goto out;
+ if (!bp) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out;
+ }
+ xfs_scrub_buffer_recheck(ds->sc, bp);
+
+ dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
+
+ /* Make sure we got a real directory entry. */
+ p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr);
+ endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
+ if (!endp) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
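+	/*
+	 * Walk the data entries to confirm that the leaf entry points at
+	 * the start of a real directory entry, not into the middle of one.
+	 */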
+ while (p < endp) {
+ struct xfs_dir2_data_entry *dep;
+ struct xfs_dir2_data_unused *dup;
+
+ dup = (struct xfs_dir2_data_unused *)p;
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ p += be16_to_cpu(dup->length);
+ continue;
+ }
+ dep = (struct xfs_dir2_data_entry *)p;
+ if (dep == dent)
+ break;
+ p += mp->m_dir_inode_ops->data_entsize(dep->namelen);
+ }
+ if (p >= endp) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+
+ /* Retrieve the entry, sanity check it, and compare hashes. */
+ ino = be64_to_cpu(dent->inumber);
+ hash = be32_to_cpu(ent->hashval);
+ tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent));
+ if (!xfs_verify_dir_ino(mp, ino) || tag != off)
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ if (dent->namelen == 0) {
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+ calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+ if (calc_hash != hash)
+ xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+
+out_relse:
+ xfs_trans_brelse(ds->dargs.trans, bp);
+out:
+ return error;
+}
+
+/*
+ * Is this unused entry either in the bestfree or smaller than all of
+ * them? We've already checked that the bestfrees are sorted longest to
+ * shortest, and that there aren't any bogus entries.
+ */
+STATIC void
+xfs_scrub_directory_check_free_entry(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup)
+{
+ struct xfs_dir2_data_free *dfp;
+ unsigned int dup_length;
+
+ dup_length = be16_to_cpu(dup->length);
+
+ /* Unused entry is shorter than any of the bestfrees */
+ if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return;
+
+ for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--)
+ if (dup_length == be16_to_cpu(dfp->length))
+ return;
+
+ /* Unused entry should be in the bestfrees but wasn't found. */
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/* Check free space info in a directory data block. */
+STATIC int
+xfs_scrub_directory_data_bestfree(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ bool is_block)
+{
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *dfp;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_mount *mp = sc->mp;
+ const struct xfs_dir_ops *d_ops;
+ char *ptr;
+ char *endptr;
+ u16 tag;
+ unsigned int nr_bestfrees = 0;
+ unsigned int nr_frees = 0;
+ unsigned int smallest_bestfree;
+ int newlen;
+ int offset;
+ int error;
+
+ d_ops = sc->ip->d_ops;
+
+ if (is_block) {
+ /* dir block format */
+ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ } else {
+ /* dir data format */
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp);
+ }
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+ xfs_scrub_buffer_recheck(sc, bp);
+
+ /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+
+ /* Do the bestfrees correspond to actual free space? */
+ bf = d_ops->data_bestfree_p(bp->b_addr);
+ smallest_bestfree = UINT_MAX;
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ offset = be16_to_cpu(dfp->offset);
+ if (offset == 0)
+ continue;
+ if (offset >= mp->m_dir_geo->blksize) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset);
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+
+ /* bestfree doesn't match the entry it points at? */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
+ be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
+ tag != ((char *)dup - (char *)bp->b_addr)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ /* bestfree records should be ordered largest to smallest */
+ if (smallest_bestfree < be16_to_cpu(dfp->length)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ smallest_bestfree = be16_to_cpu(dfp->length);
+ nr_bestfrees++;
+ }
+
+ /* Make sure the bestfrees are actually the best free spaces. */
+ ptr = (char *)d_ops->data_entry_p(bp->b_addr);
+ endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
+
+ /* Iterate the entries, stopping when we hit or go past the end. */
+ while (ptr < endptr) {
+ dup = (struct xfs_dir2_data_unused *)ptr;
+ /* Skip real entries */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
+ struct xfs_dir2_data_entry *dep;
+
+ dep = (struct xfs_dir2_data_entry *)ptr;
+ newlen = d_ops->data_entsize(dep->namelen);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
+ continue;
+ }
+
+ /* Spot check this free entry */
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+ if (tag != ((char *)dup - (char *)bp->b_addr))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /*
+ * Either this entry is a bestfree or it's smaller than
+ * any of the bestfrees.
+ */
+ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+
+ /* Move on. */
+ newlen = be16_to_cpu(dup->length);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
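+		/* Count this free entry only if it ends within the block. */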
+ if (ptr <= endptr)
+ nr_frees++;
+ }
+
+ /* We're required to fill all the space. */
+ if (ptr != endptr)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Did we see at least as many free slots as there are bestfrees? */
+ if (nr_frees < nr_bestfrees)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out_buf:
+ xfs_trans_brelse(sc->tp, bp);
+out:
+ return error;
+}
+
+/*
+ * Does the free space length in the free space index block (len) match
+ * the longest length in the directory data block's bestfree array?
+ * Assume that we've already checked that the data block's bestfree
+ * array is in order.
+ */
+STATIC void
+xfs_scrub_directory_check_freesp(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ struct xfs_buf *dbp,
+ unsigned int len)
+{
+ struct xfs_dir2_data_free *dfp;
+
+ dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr);
+
+ if (len != be16_to_cpu(dfp->length))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ if (len > 0 && be16_to_cpu(dfp->offset) == 0)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/* Check free space info in a directory leaf1 block. */
+STATIC int
+xfs_scrub_directory_leaf1_bestfree(
+ struct xfs_scrub_context *sc,
+ struct xfs_da_args *args,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_leaf_tail *ltp;
+ struct xfs_dir2_leaf *leaf;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ const struct xfs_dir_ops *d_ops = sc->ip->d_ops;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ __be16 *bestp;
+ __u16 best;
+ __u32 hash;
+ __u32 lasthash = 0;
+ __u32 bestcount;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block. */
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+ xfs_scrub_buffer_recheck(sc, bp);
+
+ leaf = bp->b_addr;
+ d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = d_ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+ bestcount = be32_to_cpu(ltp->bestcount);
+ bestp = xfs_dir2_leaf_bests_p(ltp);
+
+ if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /*
+ * There should be as many bestfree slots as there are dir data
+ * blocks that can fit under i_size.
+ */
+ if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Is the leaf count even remotely sane? */
+ if (leafhdr.count > d_ops->leaf_max_ents(geo)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((char *)&ents[leafhdr.count] > (char *)bestp) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /* Check hash value order, count stale entries. */
+ for (i = 0; i < leafhdr.count; i++) {
+ hash = be32_to_cpu(ents[i].hashval);
+ if (i > 0 && lasthash > hash)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ lasthash = hash;
+ if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (leafhdr.stale != stale)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Check all the bestfree entries. */
+ for (i = 0; i < bestcount; i++, bestp++) {
+ best = be16_to_cpu(*bestp);
+ if (best == NULLDATAOFF)
+ continue;
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ i * args->geo->fsbcount, -1, &dbp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ continue;
+ xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ }
+out:
+ return error;
+}
+
+/* Check free space info in a directory freespace block. */
+STATIC int
+xfs_scrub_directory_free_bestfree(
+ struct xfs_scrub_context *sc,
+ struct xfs_da_args *args,
+ xfs_dablk_t lblk)
+{
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_buf *dbp;
+ struct xfs_buf *bp;
+ __be16 *bestp;
+ __u16 best;
+ unsigned int stale = 0;
+ int i;
+ int error;
+
+ /* Read the free space block */
+ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+ xfs_scrub_buffer_recheck(sc, bp);
+
+ if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad != cpu_to_be32(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ }
+
+ /* Check all the entries. */
+ sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr);
+ bestp = sc->ip->d_ops->free_bests_p(bp->b_addr);
+ for (i = 0; i < freehdr.nvalid; i++, bestp++) {
+ best = be16_to_cpu(*bestp);
+ if (best == NULLDATAOFF) {
+ stale++;
+ continue;
+ }
+ error = xfs_dir3_data_read(sc->tp, sc->ip,
+ (freehdr.firstdb + i) * args->geo->fsbcount,
+ -1, &dbp);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+ &error))
+ continue;
+ xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+ xfs_trans_brelse(sc->tp, dbp);
+ }
+
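+	/* Every valid slot should be either in use or stale. */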
+ if (freehdr.nused + stale != freehdr.nvalid)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out:
+ return error;
+}
+
+/* Check free space information in directories. */
+STATIC int
+xfs_scrub_directory_blocks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_da_args args;
+ struct xfs_ifork *ifp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_fileoff_t leaf_lblk;
+ xfs_fileoff_t free_lblk;
+ xfs_fileoff_t lblk;
+ struct xfs_iext_cursor icur;
+ xfs_dablk_t dabno;
+ bool found;
+ int is_block = 0;
+ int error;
+
+ /* Ignore local format directories. */
+ if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
+ leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
+ free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
+
+ /* Is this a block dir? */
+ args.dp = sc->ip;
+ args.geo = mp->m_dir_geo;
+ args.trans = sc->tp;
+ error = xfs_dir2_isblock(&args, &is_block);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* Iterate all the data extents in the directory... */
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /* Block directories only have a single block at offset 0. */
+ if (is_block &&
+ (got.br_startoff > 0 ||
+ got.br_blockcount != args.geo->fsbcount)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ got.br_startoff);
+ break;
+ }
+
+ /* No more data blocks... */
+ if (got.br_startoff >= leaf_lblk)
+ break;
+
+ /*
+ * Check each data block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_data_bestfree(sc, lblk,
+ is_block);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+	/*
+	 * Look for a leaf1 block, which has free info.  A leaf-format dir
+	 * maps exactly one leaf block at LEAF_OFFSET with nothing after it;
+	 * node-format dirs keep their free info in freeindex blocks instead.
+	 */
+ if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) &&
+ got.br_startoff == leaf_lblk &&
+ got.br_blockcount == args.geo->fsbcount &&
+ !xfs_iext_next_extent(ifp, &icur, &got)) {
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ error = xfs_scrub_directory_leaf1_bestfree(sc, &args,
+ leaf_lblk);
+ if (error)
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Scan for free blocks */
+ lblk = free_lblk;
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /*
+ * Dirs can't have blocks mapped above 2^32.
+ * Single-block dirs shouldn't even be here.
+ */
+ lblk = got.br_startoff;
+ if (lblk & ~0xFFFFFFFFULL) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /*
+ * Check each dir free block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_free_bestfree(sc, &args,
+ lblk);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+out:
+ return error;
+}
+
+/* Scrub a whole directory. */
+int
+xfs_scrub_directory(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_dir_ctx sdc = {
+ .dir_iter.actor = xfs_scrub_dir_actor,
+ .dir_iter.pos = 0,
+ .sc = sc,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ int error = 0;
+
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* Plausible size? */
+ if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ goto out;
+ }
+
+ /* Check directory tree structure */
+ error = xfs_scrub_da_btree(sc, XFS_DATA_FORK, xfs_scrub_dir_rec, NULL);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /* Check the freespace. */
+ error = xfs_scrub_directory_blocks(sc);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return error;
+
+ /*
+ * Check that every dirent we see can also be looked up by hash.
+ * Userspace usually asks for a 32k buffer, so we will too.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ sc->ip->i_d.di_size);
+
+ /*
+ * Look up every name in this directory by hash.
+ *
+ * Use the xfs_readdir function to call xfs_scrub_dir_actor on
+ * every directory entry in this directory. In _actor, we check
+ * the name, inode number, and ftype (if applicable) of the
+ * entry. xfs_readdir uses the VFS filldir functions to provide
+ * iteration context.
+ *
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to reuse the _readdir and
+ * _dir_lookup routines, which do their own ILOCK locking.
+ */
+ oldpos = 0;
+ sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ while (true) {
+ error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out;
+ if (oldpos == sdc.dir_iter.pos)
+ break;
+ oldpos = sdc.dir_iter.pos;
+ }
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
new file mode 100644
index 000000000000..106ca4bd753f
--- /dev/null
+++ b/fs/xfs/scrub/ialloc.c
@@ -0,0 +1,528 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub inode btrees.
+ * If we detect a discrepancy between the inobt and the inode,
+ * try again after forcing logged inode cores out to disk.
+ */
+int
+xfs_scrub_setup_ag_iallocbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder);
+}
+
+/* Inode btree scrubber. */
+
+/*
+ * If we're checking the finobt, cross-reference with the inobt.
+ * Otherwise we're checking the inobt; if there is a finobt, make sure
+ * it has a record for this chunk if and only if there are free inodes.
+ */
+static inline void
+xfs_scrub_iallocbt_chunk_xref_other(
+ struct xfs_scrub_context *sc,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino)
+{
+ struct xfs_btree_cur **pcur;
+ bool has_irec;
+ int error;
+
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
+ pcur = &sc->sa.ino_cur;
+ else
+ pcur = &sc->sa.fino_cur;
+ if (!(*pcur))
+ return;
+ error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec);
+ if (!xfs_scrub_should_check_xref(sc, &error, pcur))
+ return;
+ if (((irec->ir_freecount > 0 && !has_irec) ||
+ (irec->ir_freecount == 0 && has_irec)))
+ xfs_scrub_btree_xref_set_corrupt(sc, *pcur, 0);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_iallocbt_chunk_xref(
+ struct xfs_scrub_context *sc,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_owner_info oinfo;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, len);
+ xfs_scrub_iallocbt_chunk_xref_other(sc, irec, agino);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+ xfs_scrub_xref_is_owned_by(sc, agbno, len, &oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, len);
+}
+
+/* Is this chunk worth checking? */
+STATIC bool
+xfs_scrub_iallocbt_chunk(
+ struct xfs_scrub_btree *bs,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+
+ bno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xfs_scrub_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
+
+ return true;
+}
+
+/* Count the number of free inodes. */
+static unsigned int
+xfs_scrub_iallocbt_freecount(
+ xfs_inofree_t freemask)
+{
+ BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
+ return hweight64(freemask);
+}
+
+/* Check a particular inode with ir_free. */
+STATIC int
+xfs_scrub_iallocbt_check_cluster_freemask(
+ struct xfs_scrub_btree *bs,
+ xfs_ino_t fsino,
+ xfs_agino_t chunkino,
+ xfs_agino_t clusterino,
+ struct xfs_inobt_rec_incore *irec,
+ struct xfs_buf *bp)
+{
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ bool inode_is_free = false;
+ bool freemask_ok;
+ bool inuse;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(bs->sc, &error))
+ return error;
+
+ dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+ (dip->di_version >= 3 &&
+ be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
+ inode_is_free = true;
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
+ fsino + clusterino, &inuse);
+ if (error == -ENODATA) {
+ /* Not cached, just read the disk buffer */
+ freemask_ok = inode_is_free ^ !!(dip->di_mode);
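+		/*
+		 * If they disagree, the incore inode may simply not have
+		 * been flushed to disk yet; return -EDEADLOCK so that scrub
+		 * retries after forcing the logged inode cores out (see the
+		 * setup function above).
+		 */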
+ if (!bs->sc->try_harder && !freemask_ok)
+ return -EDEADLOCK;
+ } else if (error < 0) {
+ /*
+ * Inode is only half assembled, or there was an IO error,
+ * or the verifier failed, so don't bother trying to check.
+ * The inode scrubber can deal with this.
+ */
+ goto out;
+ } else {
+ /* Inode is all there. */
+ freemask_ok = inode_is_free ^ inuse;
+ }
+ if (!freemask_ok)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+out:
+ return 0;
+}
+
+/* Make sure the free mask is consistent with what the inodes think. */
+STATIC int
+xfs_scrub_iallocbt_check_freemask(
+ struct xfs_scrub_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_imap imap;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *bp;
+ xfs_ino_t fsino;
+ xfs_agino_t nr_inodes;
+ xfs_agino_t agino;
+ xfs_agino_t chunkino;
+ xfs_agino_t clusterino;
+ xfs_agblock_t agbno;
+ int blks_per_cluster;
+ uint16_t holemask;
+ uint16_t ir_holemask;
+ int error = 0;
+
+ /* Make sure the freemask matches the inode records. */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+
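+	/* Each pass of this loop checks one inode cluster buffer. */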
+ for (agino = irec->ir_startino;
+ agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
+ agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ chunkino = agino - irec->ir_startino;
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+
+		/* Compute the holemask for this cluster. */
+ for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
+ clusterino += XFS_INODES_PER_HOLEMASK_BIT)
+ holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* The whole cluster must be a hole or not a hole. */
+ ir_holemask = (irec->ir_holemask & holemask);
+ if (ir_holemask != holemask && ir_holemask != 0) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ continue;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xfs_scrub_xref_is_not_owned_by(bs->sc, agbno,
+ blks_per_cluster, &oinfo);
+ continue;
+ }
+
+ xfs_scrub_xref_is_owned_by(bs->sc, agbno, blks_per_cluster,
+ &oinfo);
+
+ /* Grab the inode cluster buffer. */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
+ agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap.im_boffset = 0;
+
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
+ &dip, &bp, 0, 0);
+ if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, 0,
+ &error))
+ continue;
+
+ /* Which inodes are free? */
+ for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
+ error = xfs_scrub_iallocbt_check_cluster_freemask(bs,
+ fsino, chunkino, clusterino, irec, bp);
+ if (error) {
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ return error;
+ }
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ }
+
+ return error;
+}
+
+/* Scrub an inobt/finobt record. */
+STATIC int
+xfs_scrub_iallocbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_filblks_t *inode_blocks = bs->private;
+ struct xfs_inobt_rec_incore irec;
+ uint64_t holes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agino_t agino;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
+ int holecount;
+ int i;
+ int error = 0;
+ unsigned int real_freecount;
+ uint16_t holemask;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ if (irec.ir_count > XFS_INODES_PER_CHUNK ||
+ irec.ir_freecount > XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ real_freecount = irec.ir_freecount +
+ (XFS_INODES_PER_CHUNK - irec.ir_count);
+ if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ agino = irec.ir_startino;
+ /* Record has to be properly aligned within the AG. */
+ if (!xfs_verify_agino(mp, agno, agino) ||
+ !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+	/* Make sure this record is aligned to cluster and inode alignment sizes. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
+ if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
+ (agbno & (xfs_icluster_size_fsb(mp) - 1)))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
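+	/* Tally inode blocks for the rmap cross-check done after the walk. */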
+ *inode_blocks += XFS_B_TO_FSB(mp,
+ irec.ir_count * mp->m_sb.sb_inodesize);
+
+ /* Handle non-sparse inodes */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
+ if (irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ goto out;
+ goto check_freemask;
+ }
+
+ /* Check each chunk of a sparse inode cluster. */
+ holemask = irec.ir_holemask;
+ holecount = 0;
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
+ holes = ~xfs_inobt_irec_to_allocmask(&irec);
+ if ((holes & irec.ir_free) != holes ||
+ irec.ir_freecount > irec.ir_count)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
+ if (holemask & 1)
+ holecount += XFS_INODES_PER_HOLEMASK_BIT;
+ else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ break;
+ holemask >>= 1;
+ agino += XFS_INODES_PER_HOLEMASK_BIT;
+ }
+
+ if (holecount > XFS_INODES_PER_CHUNK ||
+ holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+check_freemask:
+ error = xfs_scrub_iallocbt_check_freemask(bs, &irec);
+ if (error)
+ goto out;
+
+out:
+ return error;
+}
+
+/*
+ * Make sure the inode btrees are as large as the rmap thinks they are.
+ * Don't bother if we're missing btree cursors, as we're already corrupt.
+ */
+STATIC void
+xfs_scrub_iallocbt_xref_rmap_btreeblks(
+ struct xfs_scrub_context *sc,
+ int which)
+{
+ struct xfs_owner_info oinfo;
+ xfs_filblks_t blocks;
+ xfs_extlen_t inobt_blocks = 0;
+ xfs_extlen_t finobt_blocks = 0;
+ int error;
+
+ if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
+ (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur))
+ return;
+
+ /* Check that we saw as many inobt blocks as the rmap says. */
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &inobt_blocks);
+ if (!xfs_scrub_process_error(sc, 0, 0, &error))
+ return;
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &finobt_blocks);
+ if (!xfs_scrub_process_error(sc, 0, 0, &error))
+ return;
+ }
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
+ error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
+ &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != inobt_blocks + finobt_blocks)
+ xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0);
+}
+
+/*
+ * Make sure that the inobt records point to the same number of blocks as
+ * the rmap says are owned by inodes.
+ */
+STATIC void
+xfs_scrub_iallocbt_xref_rmap_inodes(
+ struct xfs_scrub_context *sc,
+ int which,
+ xfs_filblks_t inode_blocks)
+{
+ struct xfs_owner_info oinfo;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sa.rmap_cur)
+ return;
+
+ /* Check that we saw as many inode blocks as the rmap knows about. */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+ error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
+ &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != inode_blocks)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* Scrub the inode btrees for some AG. */
+STATIC int
+xfs_scrub_iallocbt(
+ struct xfs_scrub_context *sc,
+ xfs_btnum_t which)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_owner_info oinfo;
+ xfs_filblks_t inode_blocks = 0;
+ int error;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
+ cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ error = xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo,
+ &inode_blocks);
+ if (error)
+ return error;
+
+ xfs_scrub_iallocbt_xref_rmap_btreeblks(sc, which);
+
+ /*
+ * If we're scrubbing the inode btree, inode_blocks is the number of
+ * blocks pointed to by all the inode chunk records. Therefore, we
+ * should compare to the number of inode chunk blocks that the rmap
+ * knows about. We can't do this for the finobt since it only points
+ * to inode chunks with free inodes.
+ */
+ if (which == XFS_BTNUM_INO)
+ xfs_scrub_iallocbt_xref_rmap_inodes(sc, which, inode_blocks);
+
+ return error;
+}
+
+int
+xfs_scrub_inobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_iallocbt(sc, XFS_BTNUM_INO);
+}
+
+int
+xfs_scrub_finobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO);
+}
+
+/* See if an inode btree has (or doesn't have) an inode chunk record. */
+static inline void
+xfs_scrub_xref_inode_check(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ struct xfs_btree_cur **icur,
+ bool should_have_inodes)
+{
+ bool has_inodes;
+ int error;
+
+ if (!(*icur))
+ return;
+
+ error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
+ if (!xfs_scrub_should_check_xref(sc, &error, icur))
+ return;
+ if (has_inodes != should_have_inodes)
+ xfs_scrub_btree_xref_set_corrupt(sc, *icur, 0);
+}
+
+/* xref check that the extent is not covered by inodes */
+void
+xfs_scrub_xref_is_not_inode_chunk(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false);
+ xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false);
+}
+
+/* xref check that the extent is covered by inodes */
+void
+xfs_scrub_xref_is_inode_chunk(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ xfs_scrub_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true);
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
new file mode 100644
index 000000000000..df14930e4fc5
--- /dev/null
+++ b/fs/xfs/scrub/inode.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_rmap.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Grab total control of the inode metadata. It doesn't matter here if
+ * the file data is still changing; exclusive access to the metadata is
+ * the goal.
+ */
+int
+xfs_scrub_setup_inode(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /*
+ * Try to get the inode. If the verifiers fail, we try again
+ * in raw mode.
+ */
+ error = xfs_scrub_get_inode(sc, ip);
+ switch (error) {
+ case 0:
+ break;
+ case -EFSCORRUPTED:
+ case -EFSBADCRC:
+ return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ default:
+ return error;
+ }
+
+ /* Got the inode, lock it and we're ready to go. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ if (error)
+ goto out;
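+	/*
+	 * Take the ILOCK only after allocating the transaction so that we
+	 * don't end up waiting on log reservation while holding the ILOCK.
+	 */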
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+ /* scrub teardown will unlock and release the inode for us */
+ return error;
+}
+
+/* Inode core */
+
+/* Validate di_extsize hint. */
+STATIC void
+xfs_scrub_inode_extsize(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
+ * These functions must be kept in sync with each other.
+ */
+STATIC void
+xfs_scrub_inode_cowextsize(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_inode_validate_cowextsize(sc->mp,
+ be32_to_cpu(dip->di_cowextsize), mode, flags,
+ flags2);
+ if (fa)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+}
+
+/* Make sure the di_flags make sense for the inode. */
+STATIC void
+xfs_scrub_inode_flags(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (flags & ~XFS_DIFLAG_ANY)
+ goto bad;
+
+ /* rt flags require rt device */
+ if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
+ !mp->m_rtdev_targp)
+ goto bad;
+
+ /* new rt bitmap flag only valid for rbmino */
+ if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino)
+ goto bad;
+
+ /* directory-only flags */
+ if ((flags & (XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS)) &&
+ !S_ISDIR(mode))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) &&
+ !S_ISREG(mode))
+ goto bad;
+
+ /* filestreams and rt make no sense */
+ if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino);
+}
+
+/* Make sure the di_flags2 make sense for the inode. */
+STATIC void
+xfs_scrub_inode_flags2(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ if (flags2 & ~XFS_DIFLAG2_ANY)
+ goto bad;
+
+ /* reflink flag requires reflink feature */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_sb_version_hasreflink(&mp->m_sb))
+ goto bad;
+
+ /* cowextsize flag is checked w.r.t. mode separately */
+
+ /* file/dir-only flags */
+ if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode)))
+ goto bad;
+
+ /* file-only flags */
+ if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
+ goto bad;
+
+ /* realtime and reflink make no sense, currently */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ goto bad;
+
+ /* dax and reflink make no sense, currently */
+ if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
+ goto bad;
+
+ return;
+bad:
+ xfs_scrub_ino_set_corrupt(sc, ino);
+}
+
+/* Scrub all the ondisk inode fields. */
+STATIC void
+xfs_scrub_dinode(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ size_t fork_recs;
+ unsigned long long isize;
+ uint64_t flags2;
+ uint32_t nextents;
+ uint16_t flags;
+ uint16_t mode;
+
+ flags = be16_to_cpu(dip->di_flags);
+ if (dip->di_version >= 3)
+ flags2 = be64_to_cpu(dip->di_flags2);
+ else
+ flags2 = 0;
+
+ /* di_mode */
+ mode = be16_to_cpu(dip->di_mode);
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ /* mode is recognized */
+ break;
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* v1/v2 fields */
+ switch (dip->di_version) {
+ case 1:
+ /*
+ * We autoconvert v1 inodes into v2 inodes on writeout,
+ * so just mark this inode for preening.
+ */
+ xfs_scrub_ino_set_preen(sc, ino);
+ break;
+ case 2:
+ case 3:
+ if (dip->di_onlink != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ if (dip->di_mode == 0 && sc->ip)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ if (dip->di_projid_hi != 0 &&
+ !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ return;
+ }
+
+ /*
+ * di_uid/di_gid -- -1 isn't invalid, but there's no way that
+ * userspace could have created that.
+ */
+ if (dip->di_uid == cpu_to_be32(-1U) ||
+ dip->di_gid == cpu_to_be32(-1U))
+ xfs_scrub_ino_set_warning(sc, ino);
+
+ /* di_format */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if (!S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* di_[amc]time.nsec */
+ if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /*
+ * di_size. xfs_dinode_verify checks for things that screw up
+ * the VFS such as the upper bit being set and zero-length
+ * symlinks/directories, but we can do more here.
+ */
+ isize = be64_to_cpu(dip->di_size);
+ if (isize & (1ULL << 63))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /* Devices, fifos, and sockets must have zero size */
+ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /* Directories can't be larger than the data section size (32G) */
+ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /* Symlinks can't be larger than SYMLINK_MAXLEN */
+ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /*
+ * Warn if the running kernel can't handle the kinds of offsets
+ * needed to deal with the file size. In other words, if the
+ * pagecache can't cache all the blocks in this file due to
+ * overly large offsets, flag the inode for admin review.
+ */
+ if (isize >= mp->m_super->s_maxbytes)
+ xfs_scrub_ino_set_warning(sc, ino);
+
+ /* di_nblocks */
+ if (flags2 & XFS_DIFLAG2_REFLINK) {
+ ; /* nblocks can exceed dblocks */
+ } else if (flags & XFS_DIFLAG_REALTIME) {
+ /*
+ * nblocks is the sum of data extents (in the rtdev),
+ * attr extents (in the datadev), and both forks' bmbt
+ * blocks (in the datadev). This clumsy check is the
+ * best we can do without cross-referencing with the
+ * inode forks.
+ */
+ if (be64_to_cpu(dip->di_nblocks) >=
+ mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ } else {
+ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ }
+
+ xfs_scrub_inode_flags(sc, dip, ino, mode, flags);
+
+ xfs_scrub_inode_extsize(sc, dip, ino, mode, flags);
+
+ /* di_nextents */
+ nextents = be32_to_cpu(dip->di_nextents);
+ fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
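+	/*
+	 * An extent-format fork must fit all of its records in the inline
+	 * fork area; a btree-format fork only makes sense if the records
+	 * would overflow that area.
+	 */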
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ }
+
+ /* di_forkoff */
+ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /* di_aformat */
+ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
+ dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ dip->di_aformat != XFS_DINODE_FMT_BTREE)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+
+ /* di_anextents */
+ nextents = be16_to_cpu(dip->di_anextents);
+ fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ }
+
+ if (dip->di_version >= 3) {
+ if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+ xfs_scrub_inode_flags2(sc, dip, ino, mode, flags, flags2);
+ xfs_scrub_inode_cowextsize(sc, dip, ino, mode, flags,
+ flags2);
+ }
+}
+
+/*
+ * Make sure the finobt doesn't think this inode is free.
+ * We don't have to check the inobt ourselves because we got the inode via
+ * IGET_UNTRUSTED, which checks the inobt for us.
+ */
+static void
+xfs_scrub_inode_xref_finobt(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ struct xfs_inobt_rec_incore rec;
+ xfs_agino_t agino;
+ int has_record;
+ int error;
+
+ if (!sc->sa.fino_cur)
+ return;
+
+ agino = XFS_INO_TO_AGINO(sc->mp, ino);
+
+ /*
+ * Try to get the finobt record. If we can't get it, then we're
+ * in good shape.
+ */
+ error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE,
+ &has_record);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
+ !has_record)
+ return;
+
+ error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
+ !has_record)
+ return;
+
+ /*
+ * Otherwise, make sure this record either doesn't cover this inode,
+ * or that it does but it's marked present.
+ */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ return;
+
+ if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0);
+}
+
+/* Cross reference the inode fields with the forks. */
+STATIC void
+xfs_scrub_inode_xref_bmap(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip)
+{
+ xfs_extnum_t nextents;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ int error;
+
+ /* Walk all the extents to check nextents/naextents/nblocks. */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (!xfs_scrub_should_check_xref(sc, &error, NULL))
+ return;
+ if (nextents < be32_to_cpu(dip->di_nextents))
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (!xfs_scrub_should_check_xref(sc, &error, NULL))
+ return;
+ if (nextents != be16_to_cpu(dip->di_anextents))
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+
+ /* Check nblocks against the inode. */
+ if (count + acount != be64_to_cpu(dip->di_nblocks))
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_inode_xref(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip)
+{
+ struct xfs_owner_info oinfo;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ agno = XFS_INO_TO_AGNO(sc->mp, ino);
+ agbno = XFS_INO_TO_AGBNO(sc->mp, ino);
+
+ error = xfs_scrub_ag_init(sc, agno, &sc->sa);
+ if (!xfs_scrub_xref_process_error(sc, agno, agbno, &error))
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, 1);
+ xfs_scrub_inode_xref_finobt(sc, ino);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+ xfs_scrub_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xfs_scrub_xref_is_not_shared(sc, agbno, 1);
+ xfs_scrub_inode_xref_bmap(sc, dip);
+
+ xfs_scrub_ag_free(sc, &sc->sa);
+}
+
+/*
+ * If the reflink iflag disagrees with a scan for shared data fork extents,
+ * either flag an error (shared extents w/ no flag) or a preen (flag set w/o
+ * any shared extents). We already checked for the reflink iflag being set
+ * on a non-reflink filesystem.
+ */
+static void
+xfs_scrub_inode_check_reflink_iflag(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ bool has_shared;
+ int error;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return;
+
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &has_shared);
+ if (!xfs_scrub_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ return;
+ if (xfs_is_reflink_inode(sc->ip) && !has_shared)
+ xfs_scrub_ino_set_preen(sc, ino);
+ else if (!xfs_is_reflink_inode(sc->ip) && has_shared)
+ xfs_scrub_ino_set_corrupt(sc, ino);
+}
+
+/* Scrub an inode. */
+int
+xfs_scrub_inode(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_dinode di;
+ int error = 0;
+
+ /*
+ * If sc->ip is NULL, that means that the setup function called
+ * xfs_iget to look up the inode. xfs_iget returned -EFSCORRUPTED
+ * and a NULL inode, so flag the corruption error and return.
+ */
+ if (!sc->ip) {
+ xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino);
+ return 0;
+ }
+
+ /* Scrub the inode core. */
+ xfs_inode_to_disk(sc->ip, &di, 0);
+ xfs_scrub_dinode(sc, &di, sc->ip->i_ino);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Look for discrepancies between file's data blocks and the reflink
+ * iflag. We already checked the iflag against the file mode when
+ * we scrubbed the dinode.
+ */
+ if (S_ISREG(VFS_I(sc->ip)->i_mode))
+ xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino);
+
+ xfs_scrub_inode_xref(sc, sc->ip->i_ino, &di);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
new file mode 100644
index 000000000000..1fb88c18d455
--- /dev/null
+++ b/fs/xfs/scrub/parent.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up to scrub parents. */
+int
+xfs_scrub_setup_parent(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Parent pointers */
+
+/* Look for an entry in a parent pointing to this inode. */
+
+struct xfs_scrub_parent_ctx {
+ struct dir_context dc;
+ xfs_ino_t ino;
+ xfs_nlink_t nlink;
+};
+
+/* Look for a single entry in a directory pointing to an inode. */
+STATIC int
+xfs_scrub_parent_actor(
+ struct dir_context *dc,
+ const char *name,
+ int namelen,
+ loff_t pos,
+ u64 ino,
+ unsigned type)
+{
+ struct xfs_scrub_parent_ctx *spc;
+
+ spc = container_of(dc, struct xfs_scrub_parent_ctx, dc);
+ if (spc->ino == ino)
+ spc->nlink++;
+ return 0;
+}
+
+/* Count the number of dentries in the parent dir that point to this inode. */
+STATIC int
+xfs_scrub_parent_count_parent_dentries(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *parent,
+ xfs_nlink_t *nlink)
+{
+ struct xfs_scrub_parent_ctx spc = {
+ .dc.actor = xfs_scrub_parent_actor,
+ .dc.pos = 0,
+ .ino = sc->ip->i_ino,
+ .nlink = 0,
+ };
+ size_t bufsize;
+ loff_t oldpos;
+ uint lock_mode;
+ int error = 0;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there. This is
+ * how we guarantee that the parent's extent map has been loaded,
+ * if there is one.
+ */
+ lock_mode = xfs_ilock_data_map_shared(parent);
+ if (parent->i_d.di_nextents > 0)
+ error = xfs_dir3_data_readahead(parent, 0, -1);
+ xfs_iunlock(parent, lock_mode);
+ if (error)
+ return error;
+
+ /*
+ * Iterate the parent dir to confirm that there is
+ * exactly one entry pointing back to the inode being
+ * scanned.
+ */
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+ parent->i_d.di_size);
+ oldpos = 0;
+ while (true) {
+ error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
+ if (error)
+ goto out;
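+		/* If the directory offset stopped advancing, we're done. */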
+ if (oldpos == spc.dc.pos)
+ break;
+ oldpos = spc.dc.pos;
+ }
+ *nlink = spc.nlink;
+out:
+ return error;
+}
+
+/*
+ * Given the inode number of the alleged parent of the inode being
+ * scrubbed, try to validate that the parent has exactly one directory
+ * entry pointing back to the inode being scrubbed.
+ */
+STATIC int
+xfs_scrub_parent_validate(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t dnum,
+ bool *try_again)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *dp = NULL;
+ xfs_nlink_t expected_nlink;
+ xfs_nlink_t nlink;
+ int error = 0;
+
+ *try_again = false;
+
+ /* '..' must not point to ourselves. */
+ if (sc->ip->i_ino == dnum) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /*
+ * Grab this parent inode. We release the inode before we
+	 * cancel the scrub transaction. Since we don't know a priori
+	 * that releasing the inode won't trigger eofblocks cleanup
+	 * (which allocates what would be a nested transaction) if the
+	 * parent pointer erroneously points to a file, we can't use
+	 * DONTCACHE here because DONTCACHE inodes can trigger immediate
+	 * inactive cleanup of the inode.
+	 *
+	 * If _iget returns -EINVAL then the parent inode number is
+	 * garbage and the directory is corrupt. If _iget returns
+	 * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt, which
+	 * is a cross-referencing error. Any other error is an
+	 * operational error.
+ */
+ error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
+ if (error == -EINVAL) {
+ error = -EFSCORRUPTED;
+ xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
+ if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_rele;
+ }
+
+ /*
+ * We prefer to keep the inode locked while we lock and search
+ * its alleged parent for a forward reference. If we can grab
+ * the iolock, validate the pointers and we're done. We must
+ * use nowait here to avoid an ABBA deadlock on the parent and
+ * the child inodes.
+ */
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
+ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ goto out_unlock;
+ if (nlink != expected_nlink)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_unlock;
+ }
+
+ /*
+ * The game changes if we get here. We failed to lock the parent,
+ * so we're going to try to verify both pointers while only holding
+ * one lock so as to avoid deadlocking with something that's actually
+ * trying to traverse down the directory tree.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+
+ /* Go looking for our dentry. */
+ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+ if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+ /* Drop the parent lock, relock this inode. */
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /*
+ * If we're an unlinked directory, the parent /won't/ have a link
+ * to us. Otherwise, it should have one link. We have to re-set
+ * it here because we dropped the lock on sc->ip.
+ */
+ expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+ /* Look up '..' to see if the inode changed. */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_rele;
+
+ /* Drat, parent changed. Try again! */
+ if (dnum != dp->i_ino) {
+ iput(VFS_I(dp));
+ *try_again = true;
+ return 0;
+ }
+ iput(VFS_I(dp));
+
+ /*
+ * '..' didn't change, so check that there was only one entry
+ * for us in the parent.
+ */
+ if (nlink != expected_nlink)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return error;
+
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+out_rele:
+ iput(VFS_I(dp));
+out:
+ return error;
+}
+
+/* Scrub a parent pointer. */
+int
+xfs_scrub_parent(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dnum;
+ bool try_again;
+ int tries = 0;
+ int error = 0;
+
+ /*
+ * If we're a directory, check that the '..' link points up to
+ * a directory that has one entry pointing to us.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ /* We're not a special inode, are we? */
+ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /*
+ * The VFS grabs a read or write lock via i_rwsem before it reads
+ * or writes to a directory. If we've gotten this far we've
+ * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+ * getting a write lock on i_rwsem. Therefore, it is safe for us
+ * to drop the ILOCK here in order to do directory lookups.
+ */
+ sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
+ /* Look up '..' */
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (!xfs_verify_dir_ino(mp, dnum)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_rootip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+ sc->ip->i_ino != dnum)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ do {
+ error = xfs_scrub_parent_validate(sc, dnum, &try_again);
+ if (error)
+ goto out;
+ } while (try_again && ++tries < 20);
+
+ /*
+ * We gave it our best shot but failed, so mark this scrub
+ * incomplete. Userspace can decide if it wants to try again.
+ */
+ if (try_again && tries == 20)
+ xfs_scrub_set_incomplete(sc);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
new file mode 100644
index 000000000000..6ba465e6c885
--- /dev/null
+++ b/fs/xfs/scrub/quota.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Convert a scrub type code to a DQ flag, or return 0 if error. */
+static inline uint
+xfs_scrub_quota_to_dqtype(
+ struct xfs_scrub_context *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_UQUOTA:
+ return XFS_DQ_USER;
+ case XFS_SCRUB_TYPE_GQUOTA:
+ return XFS_DQ_GROUP;
+ case XFS_SCRUB_TYPE_PQUOTA:
+ return XFS_DQ_PROJ;
+ default:
+ return 0;
+ }
+}
+
+/* Set us up to scrub a quota. */
+int
+xfs_scrub_setup_quota(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ uint dqtype;
+
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+ if (dqtype == 0)
+ return -EINVAL;
+ if (!xfs_this_quota_on(sc->mp, dqtype))
+ return -ENOENT;
+ return 0;
+}
+
+/* Quotas. */
+
+/* Scrub the fields in an individual quota item. */
+STATIC void
+xfs_scrub_quota_item(
+ struct xfs_scrub_context *sc,
+ uint dqtype,
+ struct xfs_dquot *dq,
+ xfs_dqid_t id)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset;
+ unsigned long long bsoft;
+ unsigned long long isoft;
+ unsigned long long rsoft;
+ unsigned long long bhard;
+ unsigned long long ihard;
+ unsigned long long rhard;
+ unsigned long long bcount;
+ unsigned long long icount;
+ unsigned long long rcount;
+ xfs_ino_t fs_icount;
+
+ offset = id / qi->qi_dqperchunk;
+
+ /*
+ * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
+ * that the actual dquot we got must either have the same id or
+ * the next higher id.
+ */
+ if (id > be32_to_cpu(d->d_id))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Did we get the dquot type we wanted? */
+ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the limits. */
+ bhard = be64_to_cpu(d->d_blk_hardlimit);
+ ihard = be64_to_cpu(d->d_ino_hardlimit);
+ rhard = be64_to_cpu(d->d_rtb_hardlimit);
+
+ bsoft = be64_to_cpu(d->d_blk_softlimit);
+ isoft = be64_to_cpu(d->d_ino_softlimit);
+ rsoft = be64_to_cpu(d->d_rtb_softlimit);
+
+ /*
+ * Warn if the hard limits are larger than the fs.
+ * Administrators can do this, though in production this seems
+ * suspect, which is why we flag it for review.
+ *
+ * Complain about corruption if the soft limit is greater than
+ * the hard limit.
+ */
+ if (bhard > mp->m_sb.sb_dblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (bsoft > bhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (ihard > mp->m_maxicount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (isoft > ihard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (rhard > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (rsoft > rhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the resource counts. */
+ bcount = be64_to_cpu(d->d_bcount);
+ icount = be64_to_cpu(d->d_icount);
+ rcount = be64_to_cpu(d->d_rtbcount);
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
+ } else {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
+ }
+ if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * We can violate the hard limits if the admin suddenly sets a
+ * lower limit than the actual usage. However, we flag it for
+ * admin review.
+ */
+ if (id != 0 && bhard != 0 && bcount > bhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && ihard != 0 && icount > ihard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && rhard != 0 && rcount > rhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+}
+
+/* Scrub all of a quota type's items. */
+int
+xfs_scrub_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off = 0;
+ xfs_dqid_t id = 0;
+ uint dqtype;
+ int nimaps;
+ int error = 0;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return -ENOENT;
+
+ mutex_lock(&qi->qi_quotaofflock);
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+ if (!xfs_this_quota_on(sc->mp, dqtype)) {
+ error = -ENOENT;
+ goto out_unlock_quota;
+ }
+
+ /* Attach to the quota inode and set sc->ip so that reporting works. */
+ ip = xfs_quota_inode(sc->mp, dqtype);
+ sc->ip = ip;
+
+ /* Look for problem extents. */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ goto out_unlock_inode;
+ }
+ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ while (1) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
+ off = irec.br_startoff + irec.br_blockcount;
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
+ XFS_BMAPI_ENTIRE);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
+ &error))
+ goto out_unlock_inode;
+ if (!nimaps)
+ break;
+ if (irec.br_startblock == HOLESTARTBLOCK)
+ continue;
+
+ /* Check the extent record doesn't point to crap. */
+ if (irec.br_startblock + irec.br_blockcount <=
+ irec.br_startblock)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
+ !xfs_verify_fsbno(mp, irec.br_startblock +
+ irec.br_blockcount - 1))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+
+ /*
+ * Unwritten extents or blocks mapped above the highest
+ * quota id shouldn't happen.
+ */
+ if (isnullstartblock(irec.br_startblock) ||
+ irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check all the quota items. */
+ while (id < ((xfs_dqid_t)-1ULL)) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
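+		/*
+		 * DQNEXT returns the dquot with the lowest id at or above the
+		 * one we asked for; -ENOENT means we've run off the end.
+		 */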
+ error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
+ &dq);
+ if (error == -ENOENT)
+ break;
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+ id * qi->qi_dqperchunk, &error))
+ break;
+
+ xfs_scrub_quota_item(sc, dqtype, dq, id);
+
+ id = be32_to_cpu(dq->q_core.d_id) + 1;
+ xfs_qm_dqput(dq);
+ if (!id)
+ break;
+ }
+
+out:
+ /* We set sc->ip earlier, so make sure we clear it now. */
+ sc->ip = NULL;
+out_unlock_quota:
+ mutex_unlock(&qi->qi_quotaofflock);
+ return error;
+
+out_unlock_inode:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
+}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
new file mode 100644
index 000000000000..400f1561cd3d
--- /dev/null
+++ b/fs/xfs/scrub/refcount.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reference count btrees.
+ */
+int
+xfs_scrub_setup_ag_refcountbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely. The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ * a. If a given rmap completely overlaps, mark it as seen.
+ * b. Otherwise, record the fragment (in agbno order) for later
+ * processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks. Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr. Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ * should start at or before the start of the extent.
+ * Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ * a. Find the shortest fragments in the set and remove them.
+ * b. Note the block number of the end of these fragments.
+ * c. Pull the same number of fragments from the list. All of these
+ * fragments should start at the block number recorded in the
+ * previous step.
+ * d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ * and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true. If not, the refcount is incorrect.
+ */
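+/*
+ * A small worked example (block numbers are hypothetical): for a refcount
+ * record (startblock 100, blockcount 4, refcount 2), one rmap spanning
+ * blocks 98-105 covers the whole extent and is counted as an owner seen in
+ * step 1a. Two fragments, blocks 99-101 and 102-104, are queued in step 1b;
+ * the first starts at or before block 100 and the second begins exactly
+ * where the first ends and runs past block 103, so together they account
+ * for the second owner and the record checks out.
+ */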
+struct xfs_scrub_refcnt_frag {
+ struct list_head list;
+ struct xfs_rmap_irec rm;
+};
+
+struct xfs_scrub_refcnt_check {
+ struct xfs_scrub_context *sc;
+ struct list_head fragments;
+
+ /* refcount extent we're examining */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+
+ /* number of owners seen */
+ xfs_nlink_t seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the refcountbt says we should have.
+ */
+STATIC int
+xfs_scrub_refcountbt_rmap_check(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_scrub_refcnt_check *refchk = priv;
+ struct xfs_scrub_refcnt_frag *frag;
+ xfs_agblock_t rm_last;
+ xfs_agblock_t rc_last;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(refchk->sc, &error))
+ return error;
+
+ rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+ rc_last = refchk->bno + refchk->len - 1;
+
+ /* Confirm that a single-owner refc extent is a CoW stage. */
+ if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+ xfs_scrub_btree_xref_set_corrupt(refchk->sc, cur, 0);
+ return 0;
+ }
+
+ if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+ /*
+ * The rmap overlaps the refcount record, so we can confirm
+ * one refcount owner seen.
+ */
+ refchk->seen++;
+ } else {
+ /*
+ * This rmap covers only part of the refcount record, so
+ * save the fragment for later processing. If the rmapbt
+ * is healthy each rmap_irec we see will be in agbno order
+ * so we don't need insertion sort here.
+ */
+ frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag),
+ KM_MAYFAIL | KM_NOFS);
+ if (!frag)
+ return -ENOMEM;
+ memcpy(&frag->rm, rec, sizeof(frag->rm));
+ list_add_tail(&frag->list, &refchk->fragments);
+ }
+
+ return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount. If this ever deviates from
+ * what we expect (which is the refcountbt's refcount minus the
+ * number of extents that totally covered the refcountbt extent),
+ * we have a refcountbt error.
+ */
+STATIC void
+xfs_scrub_refcountbt_process_rmap_fragments(
+ struct xfs_scrub_refcnt_check *refchk)
+{
+ struct list_head worklist;
+ struct xfs_scrub_refcnt_frag *frag;
+ struct xfs_scrub_refcnt_frag *n;
+ xfs_agblock_t bno;
+ xfs_agblock_t rbno;
+ xfs_agblock_t next_rbno;
+ xfs_nlink_t nr;
+ xfs_nlink_t target_nr;
+
+ target_nr = refchk->refcount - refchk->seen;
+ if (target_nr == 0)
+ return;
+
+ /*
+ * There are (refchk->refcount - refchk->seen)
+ * references we haven't found yet. Pull that many off the
+ * fragment list and figure out where the smallest rmap ends
+ * (and therefore the next rmap should start). All the rmaps
+ * we pull off should start at or before the beginning of the
+ * refcount record's range.
+ */
+ INIT_LIST_HEAD(&worklist);
+ rbno = NULLAGBLOCK;
+ nr = 1;
+
+ /* Make sure the fragments actually /are/ in agbno order. */
+ bno = 0;
+ list_for_each_entry(frag, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock < bno)
+ goto done;
+ bno = frag->rm.rm_startblock;
+ }
+
+ /*
+ * Find all the rmaps that start at or before the refc extent,
+ * and put them on the worklist.
+ */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock > refchk->bno)
+ goto done;
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno < rbno)
+ rbno = bno;
+ list_move_tail(&frag->list, &worklist);
+ if (nr == target_nr)
+ break;
+ nr++;
+ }
+
+ /*
+ * We should have found exactly $target_nr rmap fragments starting
+ * at or before the refcount extent.
+ */
+ if (nr != target_nr)
+ goto done;
+
+ while (!list_empty(&refchk->fragments)) {
+ /* Discard any fragments ending at rbno from the worklist. */
+ nr = 0;
+ next_rbno = NULLAGBLOCK;
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno != rbno) {
+ if (bno < next_rbno)
+ next_rbno = bno;
+ continue;
+ }
+ list_del(&frag->list);
+ kmem_free(frag);
+ nr++;
+ }
+
+ /* Try to add nr rmaps starting at rbno to the worklist. */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (frag->rm.rm_startblock != rbno)
+ goto done;
+ list_move_tail(&frag->list, &worklist);
+ if (next_rbno > bno)
+ next_rbno = bno;
+ nr--;
+ if (nr == 0)
+ break;
+ }
+
+ /*
+ * If we get here and nr > 0, this means that we added fewer
+ * items to the worklist than we discarded because the fragment
+ * list ran out of items. Therefore, we cannot maintain the
+ * required refcount. Something is wrong, so we're done.
+ */
+ if (nr)
+ goto done;
+
+ rbno = next_rbno;
+ }
+
+ /*
+ * Make sure the last extent we processed ends at or beyond
+ * the end of the refcount extent.
+ */
+ if (rbno < refchk->bno + refchk->len)
+ goto done;
+
+ /* Actually record us having seen the remaining refcount. */
+ refchk->seen = refchk->refcount;
+done:
+ /* Delete fragments and work list. */
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xfs_scrub_refcountbt_xref_rmap(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_nlink_t refcount)
+{
+ struct xfs_scrub_refcnt_check refchk = {
+ .sc = sc,
+ .bno = bno,
+ .len = len,
+ .refcount = refcount,
+ .seen = 0,
+ };
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ struct xfs_scrub_refcnt_frag *frag;
+ struct xfs_scrub_refcnt_frag *n;
+ int error;
+
+ if (!sc->sa.rmap_cur)
+ return;
+
+ /* Cross-reference with the rmapbt to confirm the refcount. */
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno + len - 1;
+
+ INIT_LIST_HEAD(&refchk.fragments);
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
+ &xfs_scrub_refcountbt_rmap_check, &refchk);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ goto out_free;
+
+ xfs_scrub_refcountbt_process_rmap_fragments(&refchk);
+ if (refcount != refchk.seen)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+out_free:
+ list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+ list_del(&frag->list);
+ kmem_free(frag);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_refcountbt_xref(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ xfs_nlink_t refcount)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, len);
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len);
+ xfs_scrub_refcountbt_xref_rmap(sc, agbno, len, refcount);
+}
+
+/* Scrub a refcountbt record. */
+STATIC int
+xfs_scrub_refcountbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agblock_t *cow_blocks = bs->private;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+ bool has_cowflag;
+ int error = 0;
+
+ bno = be32_to_cpu(rec->refc.rc_startblock);
+ len = be32_to_cpu(rec->refc.rc_blockcount);
+ refcount = be32_to_cpu(rec->refc.rc_refcount);
+
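+	/*
+	 * CoW staging extents are recorded with XFS_REFC_COW_START added to
+	 * the start block so that they sort after regular shared extents.
+	 */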
+ /* Only CoW records can have refcount == 1. */
+ has_cowflag = (bno & XFS_REFC_COW_START);
+ if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ if (has_cowflag)
+ (*cow_blocks) += len;
+
+ /* Check the extent. */
+ bno &= ~XFS_REFC_COW_START;
+ if (bno + len <= bno ||
+ !xfs_verify_agbno(mp, agno, bno) ||
+ !xfs_verify_agbno(mp, agno, bno + len - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (refcount == 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ xfs_scrub_refcountbt_xref(bs->sc, bno, len, refcount);
+
+ return error;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xfs_scrub_refcount_xref_rmap(
+ struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo,
+ xfs_filblks_t cow_blocks)
+{
+ xfs_extlen_t refcbt_blocks = 0;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sa.rmap_cur)
+ return;
+
+ /* Check that we saw as many refcbt blocks as the rmap knows about. */
+ error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
+ if (!xfs_scrub_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
+ return;
+ error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
+ &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != refcbt_blocks)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+ /* Check that we saw as many cow blocks as the rmap knows about. */
+ xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW);
+ error = xfs_scrub_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
+ &blocks);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != cow_blocks)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xfs_scrub_refcountbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+ xfs_agblock_t cow_blocks = 0;
+ int error;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+ error = xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec,
+ &oinfo, &cow_blocks);
+ if (error)
+ return error;
+
+ xfs_scrub_refcount_xref_rmap(sc, &oinfo, cow_blocks);
+
+ return 0;
+}
+
+/* xref check that a cow staging extent is marked in the refcountbt. */
+void
+xfs_scrub_xref_is_cow_staging(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_refcount_irec rc;
+ bool has_cowflag;
+ int has_refcount;
+ int error;
+
+ if (!sc->sa.refc_cur)
+ return;
+
+ /* Find the CoW staging extent. */
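+	/* CoW records are keyed with XFS_REFC_COW_START added to the block. */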
+ error = xfs_refcount_lookup_le(sc->sa.refc_cur,
+ agbno + XFS_REFC_COW_START, &has_refcount);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (!has_refcount) {
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+ return;
+ }
+
+ error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (!has_refcount) {
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+ return;
+ }
+
+ /* CoW flag must be set, refcount must be 1. */
+ has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START);
+ if (!has_cowflag || rc.rc_refcount != 1)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+
+ /* Must be at least as long as what was passed in */
+ if (rc.rc_blockcount < len)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared. Only file data blocks
+ * can have multiple owners.
+ */
+void
+xfs_scrub_xref_is_not_shared(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ bool shared;
+ int error;
+
+ if (!sc->sa.refc_cur)
+ return;
+
+ error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (shared)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
new file mode 100644
index 000000000000..8f2a7c3ff455
--- /dev/null
+++ b/fs/xfs/scrub/rmap.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reverse mapping btrees.
+ */
+int
+xfs_scrub_setup_ag_rmapbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reverse-mapping scrubber. */
+
+/* Cross-reference a rmap against the refcount btree. */
+STATIC void
+xfs_scrub_rmapbt_xref_refc(
+ struct xfs_scrub_context *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ bool non_inode;
+ bool is_bmbt;
+ bool is_attr;
+ bool is_unwritten;
+ int error;
+
+ if (!sc->sa.refc_cur)
+ return;
+
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+ is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+ /* If this is shared, must be a data fork extent. */
+ error = xfs_refcount_find_shared(sc->sa.refc_cur, irec->rm_startblock,
+ irec->rm_blockcount, &fbno, &flen, false);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (flen != 0 && (non_inode || is_attr || is_bmbt || is_unwritten))
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xfs_scrub_rmapbt_xref(
+ struct xfs_scrub_context *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_agblock_t agbno = irec->rm_startblock;
+ xfs_extlen_t len = irec->rm_blockcount;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xfs_scrub_xref_is_used_space(sc, agbno, len);
+ if (irec->rm_owner == XFS_RMAP_OWN_INODES)
+ xfs_scrub_xref_is_inode_chunk(sc, agbno, len);
+ else
+ xfs_scrub_xref_is_not_inode_chunk(sc, agbno, len);
+ if (irec->rm_owner == XFS_RMAP_OWN_COW)
+ xfs_scrub_xref_is_cow_staging(sc, irec->rm_startblock,
+ irec->rm_blockcount);
+ else
+ xfs_scrub_rmapbt_xref_refc(sc, irec);
+}
+
+/* Scrub an rmapbt record. */
+STATIC int
+xfs_scrub_rmapbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_rmap_irec irec;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ bool non_inode;
+ bool is_unwritten;
+ bool is_bmbt;
+ bool is_attr;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ goto out;
+
+ /* Check extent. */
+ if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+ /*
+ * xfs_verify_agbno returns false for static fs metadata.
+ * Since that only exists at the start of the AG, validate
+ * that by hand.
+ */
+ if (irec.rm_startblock != 0 ||
+ irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /*
+ * Otherwise we must point somewhere past the static metadata
+ * but before the end of the FS. Run the regular check.
+ */
+ if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
+ !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ irec.rm_blockcount - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ /* Check flags. */
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
+ is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+
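+	/*
+	 * bmbt blocks and non-inode owners never carry a file offset; only
+	 * data fork extents can be unwritten; and non-inode owners never
+	 * carry the bmbt/attr/unwritten attributes.
+	 */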
+ if (is_bmbt && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (is_unwritten && (is_bmbt || non_inode || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && (is_bmbt || is_unwritten || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!non_inode) {
+ if (!xfs_verify_ino(mp, irec.rm_owner))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /* Non-inode owner within the magic values? */
+ if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
+ irec.rm_owner > XFS_RMAP_OWN_FS)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ xfs_scrub_rmapbt_xref(bs->sc, &irec);
+out:
+ return error;
+}
+
+/* Scrub the rmap btree for some AG. */
+int
+xfs_scrub_rmapbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec,
+ &oinfo, NULL);
+}
+
+/* xref check that the extent is owned by a given owner */
+static inline void
+xfs_scrub_xref_check_owner(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ bool should_have_rmap)
+{
+ bool has_rmap;
+ int error;
+
+ if (!sc->sa.rmap_cur)
+ return;
+
+ error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
+ &has_rmap);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (has_rmap != should_have_rmap)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
+
+/* xref check that the extent is owned by a given owner */
+void
+xfs_scrub_xref_is_owned_by(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo)
+{
+ xfs_scrub_xref_check_owner(sc, bno, len, oinfo, true);
+}
+
+/* xref check that the extent is not owned by a given owner */
+void
+xfs_scrub_xref_is_not_owned_by(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo)
+{
+ xfs_scrub_xref_check_owner(sc, bno, len, oinfo, false);
+}
+
+/* xref check that the extent has no reverse mapping at all */
+void
+xfs_scrub_xref_has_no_owner(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ bool has_rmap;
+ int error;
+
+ if (!sc->sa.rmap_cur)
+ return;
+
+ error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
+ if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (has_rmap)
+ xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
new file mode 100644
index 000000000000..39c41dfe08ee
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xfs_scrub_setup_rt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
+ sc->ip = sc->mp->m_rbmip;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ return 0;
+}
+
+/* Realtime bitmap. */
+
+/* Scrub a free extent record from the realtime bitmap. */
+STATIC int
+xfs_scrub_rtbitmap_rec(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub_context *sc = priv;
+
+ if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock ||
+ !xfs_verify_rtbno(sc->mp, rec->ar_startblock) ||
+ !xfs_verify_rtbno(sc->mp, rec->ar_startblock +
+ rec->ar_blockcount - 1))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+}
+
+/* Scrub the realtime bitmap. */
+int
+xfs_scrub_rtbitmap(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+
+out:
+ return error;
+}
+
+/* Scrub the realtime summary. */
+int
+xfs_scrub_rtsummary(
+ struct xfs_scrub_context *sc)
+{
+ /* XXX: implement this some day */
+ return -ENOENT;
+}
+
+/* xref check that the extent is not free in the rtbitmap */
+void
+xfs_scrub_xref_is_used_rt_space(
+ struct xfs_scrub_context *sc,
+ xfs_rtblock_t fsbno,
+ xfs_extlen_t len)
+{
+ bool is_free;
+ int error;
+
+ xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len,
+ &is_free);
+ if (!xfs_scrub_should_check_xref(sc, &error, NULL))
+ goto out_unlock;
+ if (is_free)
+ xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+out_unlock:
+ xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
new file mode 100644
index 000000000000..26c75967a072
--- /dev/null
+++ b/fs/xfs/scrub/scrub.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/btree.h"
+
+/*
+ * Online Scrub and Repair
+ *
+ * Traditionally, XFS (the kernel driver) did not know how to check or
+ * repair on-disk data structures. That task was left to the xfs_check
+ * and xfs_repair tools, both of which require taking the filesystem
+ * offline for a thorough but time-consuming examination. Online
+ * scrub & repair, on the other hand, enables us to check the metadata
+ * for obvious errors while carefully stepping around the filesystem's
+ * ongoing operations, locking rules, etc.
+ *
+ * Given that most XFS metadata consist of records stored in a btree,
+ * most of the checking functions iterate the btree blocks themselves
+ * looking for irregularities. When a record block is encountered, each
+ * record can be checked for obviously bad values. Record values can
+ * also be cross-referenced against other btrees to look for potential
+ * misunderstandings between pieces of metadata.
+ *
+ * It is expected that the checkers responsible for per-AG metadata
+ * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
+ * metadata structure, and perform any relevant cross-referencing before
+ * unlocking the AG and returning the results to userspace. These
+ * scrubbers must not keep an AG locked for too long to avoid tying up
+ * the block and inode allocators.
+ *
+ * Block maps and b-trees rooted in an inode present a special challenge
+ * because they can involve extents from any AG. The general scrubber
+ * structure of lock -> check -> xref -> unlock still holds, but AG
+ * locking order rules /must/ be obeyed to avoid deadlocks. The
+ * ordering rule, of course, is that we must lock in increasing AG
+ * order. Helper functions are provided to track which AG headers we've
+ * already locked. If we detect an imminent locking order violation, we
+ * can signal a potential deadlock, in which case the scrubber can jump
+ * out to the top level, lock all the AGs in order, and retry the scrub.
+ *
+ * For file data (directories, extended attributes, symlinks) scrub, we
+ * can simply lock the inode and walk the data. For btree data
+ * (directories and attributes) we follow the same btree-scrubbing
+ * strategy outlined previously to check the records.
+ *
+ * We use a bit of trickery with transactions to avoid buffer deadlocks
+ * if there is a cycle in the metadata. The basic problem is that
+ * travelling down a btree involves locking the current buffer at each
+ * tree level. If a pointer should somehow point back to a buffer that
+ * we've already examined, we will deadlock due to the second buffer
+ * locking attempt. Note however that grabbing a buffer in transaction
+ * context links the locked buffer to the transaction. If we try to
+ * re-grab the buffer in the context of the same transaction, we avoid
+ * the second lock attempt and continue. Either the buffer verifier or
+ * the scrubber will notice that something is amiss and report the
+ * corruption. Therefore, each scrubber will allocate an empty
+ * transaction, attach buffers to it, and cancel the transaction at the
+ * end of the scrub run. Cancelling a non-dirty transaction simply
+ * unlocks the buffers.
+ *
+ * Scrub communicates several pieces of data to userspace. The first
+ * is the error code (errno), which can be used to report operational
+ * errors in performing the scrub. There are also several result flags
+ * that can be set in the scrub context. If the data structure itself
+ * is corrupt, the CORRUPT flag will be set. If the metadata is
+ * correct but otherwise suboptimal, the PREEN flag will be set.
+ *
+ * We perform secondary validation of filesystem metadata by
+ * cross-referencing every record with all other available metadata.
+ * For example, for block mapping extents, we verify that there are no
+ * records in the free space and inode btrees corresponding to that
+ * space extent and that there is a corresponding entry in the reverse
+ * mapping btree. Inconsistent metadata is noted by setting the
+ * XCORRUPT flag; btree query function errors are noted by setting the
+ * XFAIL flag and deleting the cursor to prevent further attempts to
+ * cross-reference with a defective btree.
+ */
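The reporting scheme described in the comment above can be illustrated from the caller's side. The following is an editorial sketch, not part of the patch: XFS_SCRUB_OFLAG_CORRUPT and XFS_SCRUB_OFLAG_XCORRUPT appear elsewhere in this series, while the PREEN and XFAIL macro names are assumed to follow the same XFS_SCRUB_OFLAG_ naming, and struct xfs_scrub_metadata is the ioctl structure whose sm_flags field the kernel fills in.

/* Sketch (not in the patch): classify the outcome of one scrub call. */
static const char *
scrub_outcome(int error, const struct xfs_scrub_metadata *sm)
{
        if (error)
                return "operational error";             /* errno from the call */
        if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
                return "metadata is corrupt";
        if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)
                return "cross-referencing found an inconsistency";
        if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL)       /* assumed name */
                return "cross-referencing could not be completed";
        if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)       /* assumed name */
                return "metadata ok but could be optimized";
        return "clean";
}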
+
+/*
+ * Scrub probe -- userspace uses this to probe if we're willing to scrub
+ * or repair a given mountpoint. This will be used by xfs_scrub to
+ * probe the kernel's abilities to scrub (and repair) the metadata. We
+ * do this by validating the ioctl inputs from userspace, preparing the
+ * filesystem for a scrub (or a repair) operation, and immediately
+ * returning to userspace. Userspace can use the returned errno and
+ * structure state to decide (in broad terms) if scrub/repair are
+ * supported by the running kernel.
+ */
+static int
+xfs_scrub_probe(
+ struct xfs_scrub_context *sc)
+{
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(sc, &error))
+ return error;
+
+ return 0;
+}
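A minimal userspace sketch of the probe described above, assuming that XFS_IOC_SCRUB_METADATA is the ioctl request code defined alongside struct xfs_scrub_metadata and XFS_SCRUB_TYPE_PROBE in the uapi header; probe_scrub() is a hypothetical helper, not anything in this patch. A kernel built without online scrub returns -ENOTTY (see the stub in scrub/xfs_scrub.h at the end of this series), which a caller can treat as "not supported".

/* Sketch (not in the patch): ask the kernel whether online scrub exists. */
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>    /* struct xfs_scrub_metadata et al.; assumed location */

static int
probe_scrub(const char *path)
{
        struct xfs_scrub_metadata sm;
        int fd;
        int ret;

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -errno;

        memset(&sm, 0, sizeof(sm));
        sm.sm_type = XFS_SCRUB_TYPE_PROBE;      /* "is scrub supported?" */

        ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
        if (ret < 0)
                ret = -errno;   /* e.g. -ENOTTY: no online scrub in this kernel */
        close(fd);
        return ret;
}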
+
+/* Scrub setup and teardown */
+
+/* Free all the resources and finish the transactions. */
+STATIC int
+xfs_scrub_teardown(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip_in,
+ int error)
+{
+ xfs_scrub_ag_free(sc, &sc->sa);
+ if (sc->tp) {
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+ }
+ if (sc->ip) {
+ if (sc->ilock_flags)
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ if (sc->ip != ip_in &&
+ !xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ iput(VFS_I(sc->ip));
+ sc->ip = NULL;
+ }
+ if (sc->buf) {
+ kmem_free(sc->buf);
+ sc->buf = NULL;
+ }
+ return error;
+}
+
+/* Scrubbing dispatch. */
+
+static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
+ [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */
+ .type = ST_NONE,
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_probe,
+ },
+ [XFS_SCRUB_TYPE_SB] = { /* superblock */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_superblock,
+ },
+ [XFS_SCRUB_TYPE_AGF] = { /* agf */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_agf,
+ },
+ [XFS_SCRUB_TYPE_AGFL] = { /* agfl */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_agfl,
+ },
+ [XFS_SCRUB_TYPE_AGI] = { /* agi */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_agi,
+ },
+ [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_bnobt,
+ },
+ [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_cntbt,
+ },
+ [XFS_SCRUB_TYPE_INOBT] = { /* inobt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_inobt,
+ },
+ [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_finobt,
+ .has = xfs_sb_version_hasfinobt,
+ },
+ [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_rmapbt,
+ .scrub = xfs_scrub_rmapbt,
+ .has = xfs_sb_version_hasrmapbt,
+ },
+ [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
+ .type = ST_PERAG,
+ .setup = xfs_scrub_setup_ag_refcountbt,
+ .scrub = xfs_scrub_refcountbt,
+ .has = xfs_sb_version_hasreflink,
+ },
+ [XFS_SCRUB_TYPE_INODE] = { /* inode record */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_inode,
+ .scrub = xfs_scrub_inode,
+ },
+ [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_data,
+ },
+ [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_attr,
+ },
+ [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_cow,
+ },
+ [XFS_SCRUB_TYPE_DIR] = { /* directory */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_directory,
+ .scrub = xfs_scrub_directory,
+ },
+ [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_xattr,
+ .scrub = xfs_scrub_xattr,
+ },
+ [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_symlink,
+ .scrub = xfs_scrub_symlink,
+ },
+ [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
+ .type = ST_INODE,
+ .setup = xfs_scrub_setup_parent,
+ .scrub = xfs_scrub_parent,
+ },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
+ .type = ST_FS,
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtbitmap,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
+ .type = ST_FS,
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtsummary,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
+ .type = ST_FS,
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
+ .type = ST_FS,
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
+ .type = ST_FS,
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+};
+
+/* This isn't a stable feature, warn once per day. */
+static inline void
+xfs_scrub_experimental_warning(
+ struct xfs_mount *mp)
+{
+ static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
+ "xfs_scrub_warning", 86400 * HZ, 1);
+ ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
+
+ if (__ratelimit(&scrub_warning))
+ xfs_alert(mp,
+"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+}
+
+static int
+xfs_scrub_validate_inputs(
+ struct xfs_mount *mp,
+ struct xfs_scrub_metadata *sm)
+{
+ int error;
+ const struct xfs_scrub_meta_ops *ops;
+
+ error = -EINVAL;
+ /* Check our inputs. */
+ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
+ goto out;
+ /* sm_reserved[] must be zero */
+ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
+ goto out;
+
+ error = -ENOENT;
+ /* Do we know about this type of metadata? */
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
+ goto out;
+ ops = &meta_scrub_ops[sm->sm_type];
+ if (ops->setup == NULL || ops->scrub == NULL)
+ goto out;
+ /* Does this fs even support this type of metadata? */
+ if (ops->has && !ops->has(&mp->m_sb))
+ goto out;
+
+ error = -EINVAL;
+ /* restricting fields must be appropriate for type */
+ switch (ops->type) {
+ case ST_NONE:
+ case ST_FS:
+ if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
+ goto out;
+ break;
+ case ST_PERAG:
+ if (sm->sm_ino || sm->sm_gen ||
+ sm->sm_agno >= mp->m_sb.sb_agcount)
+ goto out;
+ break;
+ case ST_INODE:
+ if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
+ goto out;
+ break;
+ default:
+ goto out;
+ }
+
+ error = -EOPNOTSUPP;
+ /*
+ * We won't scrub any filesystem that doesn't have the ability
+ * to record unwritten extents. The option was made default in
+ * 2003, removed from mkfs in 2007, and cannot be disabled in
+ * v5, so if we find a filesystem without this flag it's either
+ * really old or totally unsupported. Avoid it either way.
+ * We also don't support v1-v3 filesystems, which aren't
+ * mountable.
+ */
+ if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
+ goto out;
+
+ /* We don't know how to repair anything yet. */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ goto out;
+
+ error = 0;
+out:
+ return error;
+}
+
+/* Dispatch metadata scrubbing. */
+int
+xfs_scrub_metadata(
+ struct xfs_inode *ip,
+ struct xfs_scrub_metadata *sm)
+{
+ struct xfs_scrub_context sc;
+ struct xfs_mount *mp = ip->i_mount;
+ bool try_harder = false;
+ int error = 0;
+
+ BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
+ (sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR));
+
+ trace_xfs_scrub_start(ip, sm, error);
+
+ /* Forbidden if we are shut down or mounted norecovery. */
+ error = -ESHUTDOWN;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out;
+ error = -ENOTRECOVERABLE;
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ goto out;
+
+ error = xfs_scrub_validate_inputs(mp, sm);
+ if (error)
+ goto out;
+
+ xfs_scrub_experimental_warning(mp);
+
+retry_op:
+ /* Set up for the operation. */
+ memset(&sc, 0, sizeof(sc));
+ sc.mp = ip->i_mount;
+ sc.sm = sm;
+ sc.ops = &meta_scrub_ops[sm->sm_type];
+ sc.try_harder = try_harder;
+ sc.sa.agno = NULLAGNUMBER;
+ error = sc.ops->setup(&sc, ip);
+ if (error)
+ goto out_teardown;
+
+ /* Scrub for errors. */
+ error = sc.ops->scrub(&sc);
+ if (!try_harder && error == -EDEADLOCK) {
+ /*
+ * Scrubbers return -EDEADLOCK to mean 'try harder'.
+ * Tear down everything we hold, then set up again with
+ * preparation for worst-case scenarios.
+ */
+ error = xfs_scrub_teardown(&sc, ip, 0);
+ if (error)
+ goto out;
+ try_harder = true;
+ goto retry_op;
+ } else if (error)
+ goto out_teardown;
+
+ if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT))
+ xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+
+out_teardown:
+ error = xfs_scrub_teardown(&sc, ip, error);
+out:
+ trace_xfs_scrub_done(ip, sm, error);
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
+ sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ error = 0;
+ }
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
new file mode 100644
index 000000000000..0d92af86f67a
--- /dev/null
+++ b/fs/xfs/scrub/scrub.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_SCRUB_H__
+#define __XFS_SCRUB_SCRUB_H__
+
+struct xfs_scrub_context;
+
+/* Type info and names for the scrub types. */
+enum xfs_scrub_type {
+ ST_NONE = 1, /* disabled */
+ ST_PERAG, /* per-AG metadata */
+ ST_FS, /* per-FS metadata */
+ ST_INODE, /* per-inode metadata */
+};
+
+struct xfs_scrub_meta_ops {
+ /* Acquire whatever resources are needed for the operation. */
+ int (*setup)(struct xfs_scrub_context *,
+ struct xfs_inode *);
+
+ /* Examine metadata for errors. */
+ int (*scrub)(struct xfs_scrub_context *);
+
+ /* Decide if we even have this piece of metadata. */
+ bool (*has)(struct xfs_sb *);
+
+ /* type describing required/allowed inputs */
+ enum xfs_scrub_type type;
+};
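As an editorial illustration of this interface (every "foo" name below is hypothetical and not part of the patch), a new scrubber supplies a setup/scrub pair matching these function pointers plus a corresponding entry in the meta_scrub_ops[] dispatch table in scrub.c:

/* Hypothetical scrubber for a made-up "foo" structure; illustration only. */
STATIC int
xfs_scrub_setup_foo(
        struct xfs_scrub_context        *sc,
        struct xfs_inode                *ip)
{
        /* Acquire resources; the generic fs-wide setup is often enough. */
        return xfs_scrub_setup_fs(sc, ip);
}

STATIC int
xfs_scrub_foo(
        struct xfs_scrub_context        *sc)
{
        /* Walk the metadata, flagging OFLAG_CORRUPT on anything suspect. */
        return 0;
}

/* ...and the dispatch table would gain an entry shaped like this: */
        [XFS_SCRUB_TYPE_FOO] = {        /* hypothetical type */
                .type   = ST_FS,
                .setup  = xfs_scrub_setup_foo,
                .scrub  = xfs_scrub_foo,
        },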
+
+/* Buffer pointers and btree cursors for an entire AG. */
+struct xfs_scrub_ag {
+ xfs_agnumber_t agno;
+
+ /* AG btree roots */
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_buf *agi_bp;
+
+ /* AG btrees */
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur;
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
+};
+
+struct xfs_scrub_context {
+ /* General scrub state. */
+ struct xfs_mount *mp;
+ struct xfs_scrub_metadata *sm;
+ const struct xfs_scrub_meta_ops *ops;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip;
+ void *buf;
+ uint ilock_flags;
+ bool try_harder;
+
+ /* State tracking for single-AG operations. */
+ struct xfs_scrub_ag sa;
+};
+
+/* Metadata scrubbers */
+int xfs_scrub_tester(struct xfs_scrub_context *sc);
+int xfs_scrub_superblock(struct xfs_scrub_context *sc);
+int xfs_scrub_agf(struct xfs_scrub_context *sc);
+int xfs_scrub_agfl(struct xfs_scrub_context *sc);
+int xfs_scrub_agi(struct xfs_scrub_context *sc);
+int xfs_scrub_bnobt(struct xfs_scrub_context *sc);
+int xfs_scrub_cntbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inobt(struct xfs_scrub_context *sc);
+int xfs_scrub_finobt(struct xfs_scrub_context *sc);
+int xfs_scrub_rmapbt(struct xfs_scrub_context *sc);
+int xfs_scrub_refcountbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inode(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_data(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_attr(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_cow(struct xfs_scrub_context *sc);
+int xfs_scrub_directory(struct xfs_scrub_context *sc);
+int xfs_scrub_xattr(struct xfs_scrub_context *sc);
+int xfs_scrub_symlink(struct xfs_scrub_context *sc);
+int xfs_scrub_parent(struct xfs_scrub_context *sc);
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_rtbitmap(struct xfs_scrub_context *sc);
+int xfs_scrub_rtsummary(struct xfs_scrub_context *sc);
+#else
+static inline int
+xfs_scrub_rtbitmap(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xfs_scrub_rtsummary(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_quota(struct xfs_scrub_context *sc);
+#else
+static inline int
+xfs_scrub_quota(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+
+/* cross-referencing helpers */
+void xfs_scrub_xref_is_used_space(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xfs_scrub_xref_is_not_inode_chunk(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xfs_scrub_xref_is_inode_chunk(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xfs_scrub_xref_is_owned_by(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+void xfs_scrub_xref_is_not_owned_by(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+void xfs_scrub_xref_has_no_owner(struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xfs_scrub_xref_is_cow_staging(struct xfs_scrub_context *sc,
+ xfs_agblock_t bno, xfs_extlen_t len);
+void xfs_scrub_xref_is_not_shared(struct xfs_scrub_context *sc,
+ xfs_agblock_t bno, xfs_extlen_t len);
+#ifdef CONFIG_XFS_RT
+void xfs_scrub_xref_is_used_rt_space(struct xfs_scrub_context *sc,
+ xfs_rtblock_t rtbno, xfs_extlen_t len);
+#else
+# define xfs_scrub_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
+#endif
+
+#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
new file mode 100644
index 000000000000..3aa3d60f7c16
--- /dev/null
+++ b/fs/xfs/scrub/symlink.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up to scrub a symbolic link. */
+int
+xfs_scrub_setup_symlink(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ /* Allocate the buffer without the inode lock held. */
+ sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Symbolic links. */
+
+int
+xfs_scrub_symlink(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ loff_t len;
+ int error = 0;
+
+ if (!S_ISLNK(VFS_I(ip)->i_mode))
+ return -ENOENT;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = ip->i_d.di_size;
+
+ /* Plausible size? */
+ if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Inline symlink? */
+ if (ifp->if_flags & XFS_IFINLINE) {
+ if (len > XFS_IFORK_DSIZE(ip) ||
+ len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out;
+ }
+
+ /* Remote symlink; must read the contents. */
+ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out;
+ if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
new file mode 100644
index 000000000000..86daed0e3a45
--- /dev/null
+++ b/fs/xfs/scrub/trace.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/* Figure out which block the btree cursor was pointing to. */
+static inline xfs_fsblock_t
+xfs_scrub_btree_cur_fsbno(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level < cur->bc_nlevels && cur->bc_bufs[level])
+ return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
+ else if (level == cur->bc_nlevels - 1 &&
+ cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino);
+ else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+ return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0);
+ return NULLFSBLOCK;
+}
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
new file mode 100644
index 000000000000..5d2b1c241be5
--- /dev/null
+++ b/fs/xfs/scrub/trace.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs_scrub
+
+#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_SCRUB_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "xfs_bit.h"
+
+DECLARE_EVENT_CLASS(xfs_scrub_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
+ int error),
+ TP_ARGS(ip, sm, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->type = sm->sm_type;
+ __entry->agno = sm->sm_agno;
+ __entry->inum = sm->sm_ino;
+ __entry->gen = sm->sm_gen;
+ __entry->flags = sm->sm_flags;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->error)
+)
+#define DEFINE_SCRUB_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \
+ int error), \
+ TP_ARGS(ip, sm, error))
+
+DEFINE_SCRUB_EVENT(xfs_scrub_start);
+DEFINE_SCRUB_EVENT(xfs_scrub_done);
+DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+
+TRACE_EVENT(xfs_scrub_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int error, void *ret_ip),
+ TP_ARGS(sc, agno, bno, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_file_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int error, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_scrub_block_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, void *ret_ip),
+ TP_ARGS(sc, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+
+ fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_block_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, \
+ void *ret_ip), \
+ TP_ARGS(sc, daddr, ret_ip))
+
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error);
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen);
+
+DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, void *ret_ip),
+ TP_ARGS(sc, ino, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->ret_ip)
+)
+
+#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_ino_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \
+ void *ret_ip), \
+ TP_ARGS(sc, ino, ret_ip))
+
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_warning);
+
+DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->ret_ip)
+);
+
+#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork, \
+ xfs_fileoff_t offset, void *ret_ip), \
+ TP_ARGS(sc, whichfork, offset, ret_ip))
+
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning);
+
+TRACE_EVENT(xfs_scrub_incomplete,
+ TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip),
+ TP_ARGS(sc, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_btree_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_ifork_btree_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, ptr)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_btree_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+TRACE_EVENT(xfs_scrub_ifork_btree_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, void *ret_ip),
+ TP_ARGS(sc, cur, level, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, ptr)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_scrub_sbtree_class,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level),
+ TP_ARGS(sc, cur, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, level)
+ __field(int, nlevels)
+ __field(int, ptr)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->ptr = cur->bc_ptrs[level];
+ ),
+ TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->btnum,
+ __entry->agno,
+ __entry->bno,
+ __entry->level,
+ __entry->nlevels,
+ __entry->ptr)
+)
+#define DEFINE_SCRUB_SBTREE_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_sbtree_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, \
+ int level), \
+ TP_ARGS(sc, cur, level))
+
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec);
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key);
+
+TRACE_EVENT(xfs_scrub_xref_error,
+ TP_PROTO(struct xfs_scrub_context *sc, int error, void *ret_ip),
+ TP_ARGS(sc, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u xref error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+#endif /* _TRACE_XFS_SCRUB_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE scrub/trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
new file mode 100644
index 000000000000..e00e0eadac6a
--- /dev/null
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_H__
+#define __XFS_SCRUB_H__
+
+#ifndef CONFIG_XFS_ONLINE_SCRUB
+# define xfs_scrub_metadata(ip, sm) (-ENOTTY)
+#else
+int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm);
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
+#endif /* __XFS_SCRUB_H__ */