summaryrefslogtreecommitdiff
path: root/fs/xfs/scrub
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r--fs/xfs/scrub/agb_bitmap.h5
-rw-r--r--fs/xfs/scrub/agheader.c156
-rw-r--r--fs/xfs/scrub/agheader_repair.c944
-rw-r--r--fs/xfs/scrub/agino_bitmap.h49
-rw-r--r--fs/xfs/scrub/alloc.c2
-rw-r--r--fs/xfs/scrub/alloc_repair.c54
-rw-r--r--fs/xfs/scrub/attr.c214
-rw-r--r--fs/xfs/scrub/attr.h7
-rw-r--r--fs/xfs/scrub/attr_repair.c1663
-rw-r--r--fs/xfs/scrub/attr_repair.h15
-rw-r--r--fs/xfs/scrub/bitmap.c36
-rw-r--r--fs/xfs/scrub/bitmap.h2
-rw-r--r--fs/xfs/scrub/bmap.c174
-rw-r--r--fs/xfs/scrub/bmap_repair.c171
-rw-r--r--fs/xfs/scrub/btree.c58
-rw-r--r--fs/xfs/scrub/common.c488
-rw-r--r--fs/xfs/scrub/common.h124
-rw-r--r--fs/xfs/scrub/cow_repair.c191
-rw-r--r--fs/xfs/scrub/dab_bitmap.h37
-rw-r--r--fs/xfs/scrub/dabtree.c24
-rw-r--r--fs/xfs/scrub/dabtree.h3
-rw-r--r--fs/xfs/scrub/dir.c391
-rw-r--r--fs/xfs/scrub/dir_repair.c1968
-rw-r--r--fs/xfs/scrub/dirtree.c1009
-rw-r--r--fs/xfs/scrub/dirtree.h168
-rw-r--r--fs/xfs/scrub/dirtree_repair.c821
-rw-r--r--fs/xfs/scrub/findparent.c470
-rw-r--r--fs/xfs/scrub/findparent.h56
-rw-r--r--fs/xfs/scrub/fscounters.c100
-rw-r--r--fs/xfs/scrub/fscounters.h21
-rw-r--r--fs/xfs/scrub/fscounters_repair.c85
-rw-r--r--fs/xfs/scrub/health.c178
-rw-r--r--fs/xfs/scrub/health.h5
-rw-r--r--fs/xfs/scrub/ialloc.c40
-rw-r--r--fs/xfs/scrub/ialloc_repair.c37
-rw-r--r--fs/xfs/scrub/ino_bitmap.h37
-rw-r--r--fs/xfs/scrub/inode.c102
-rw-r--r--fs/xfs/scrub/inode_repair.c627
-rw-r--r--fs/xfs/scrub/iscan.c826
-rw-r--r--fs/xfs/scrub/iscan.h100
-rw-r--r--fs/xfs/scrub/listxattr.c320
-rw-r--r--fs/xfs/scrub/listxattr.h19
-rw-r--r--fs/xfs/scrub/metapath.c679
-rw-r--r--fs/xfs/scrub/newbt.c113
-rw-r--r--fs/xfs/scrub/newbt.h8
-rw-r--r--fs/xfs/scrub/nlinks.c1049
-rw-r--r--fs/xfs/scrub/nlinks.h109
-rw-r--r--fs/xfs/scrub/nlinks_repair.c351
-rw-r--r--fs/xfs/scrub/orphanage.c629
-rw-r--r--fs/xfs/scrub/orphanage.h86
-rw-r--r--fs/xfs/scrub/parent.c721
-rw-r--r--fs/xfs/scrub/parent_repair.c1639
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/quota_repair.c9
-rw-r--r--fs/xfs/scrub/quotacheck.c870
-rw-r--r--fs/xfs/scrub/quotacheck.h76
-rw-r--r--fs/xfs/scrub/quotacheck_repair.c261
-rw-r--r--fs/xfs/scrub/rcbag.c307
-rw-r--r--fs/xfs/scrub/rcbag.h28
-rw-r--r--fs/xfs/scrub/rcbag_btree.c370
-rw-r--r--fs/xfs/scrub/rcbag_btree.h81
-rw-r--r--fs/xfs/scrub/readdir.c144
-rw-r--r--fs/xfs/scrub/readdir.h3
-rw-r--r--fs/xfs/scrub/reap.c739
-rw-r--r--fs/xfs/scrub/reap.h30
-rw-r--r--fs/xfs/scrub/refcount.c19
-rw-r--r--fs/xfs/scrub/refcount_repair.c190
-rw-r--r--fs/xfs/scrub/repair.c514
-rw-r--r--fs/xfs/scrub/repair.h102
-rw-r--r--fs/xfs/scrub/rgb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rgsuper.c88
-rw-r--r--fs/xfs/scrub/rmap.c28
-rw-r--r--fs/xfs/scrub/rmap_repair.c1743
-rw-r--r--fs/xfs/scrub/rtb_bitmap.h37
-rw-r--r--fs/xfs/scrub/rtbitmap.c125
-rw-r--r--fs/xfs/scrub/rtbitmap.h55
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c453
-rw-r--r--fs/xfs/scrub/rtrefcount.c661
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c761
-rw-r--r--fs/xfs/scrub/rtrmap.c323
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c987
-rw-r--r--fs/xfs/scrub/rtsummary.c150
-rw-r--r--fs/xfs/scrub/rtsummary.h37
-rw-r--r--fs/xfs/scrub/rtsummary_repair.c186
-rw-r--r--fs/xfs/scrub/scrub.c431
-rw-r--r--fs/xfs/scrub/scrub.h179
-rw-r--r--fs/xfs/scrub/stats.c11
-rw-r--r--fs/xfs/scrub/symlink.c16
-rw-r--r--fs/xfs/scrub/symlink_repair.c510
-rw-r--r--fs/xfs/scrub/tempexch.h22
-rw-r--r--fs/xfs/scrub/tempfile.c980
-rw-r--r--fs/xfs/scrub/tempfile.h51
-rw-r--r--fs/xfs/scrub/trace.c16
-rw-r--r--fs/xfs/scrub/trace.h2389
-rw-r--r--fs/xfs/scrub/xfarray.c264
-rw-r--r--fs/xfs/scrub/xfarray.h36
-rw-r--r--fs/xfs/scrub/xfblob.c168
-rw-r--r--fs/xfs/scrub/xfblob.h50
-rw-r--r--fs/xfs/scrub/xfile.c355
-rw-r--r--fs/xfs/scrub/xfile.h66
-rw-r--r--fs/xfs/scrub/xfs_scrub.h6
101 files changed, 30382 insertions, 1771 deletions
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
index ed08f76ff4f3..e488e1f4f63d 100644
--- a/fs/xfs/scrub/agb_bitmap.h
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b)
+{
+ return xbitmap32_count_set_regions(&b->agbitmap);
+}
+
#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 6c6e5eba42c8..303374df44bd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -15,6 +15,7 @@
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -59,6 +60,34 @@ xchk_superblock_xref(
}
/*
+ * Calculate the ondisk superblock size in bytes given the feature set of the
+ * mounted filesystem (aka the primary sb). This is subtlely different from
+ * the logic in xfs_repair, which computes the size of a secondary sb given the
+ * featureset listed in the secondary sb.
+ */
+STATIC size_t
+xchk_superblock_ondisk_size(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ return offsetofend(struct xfs_dsb, sb_rtreserved);
+ if (xfs_has_metadir(mp))
+ return offsetofend(struct xfs_dsb, sb_pad);
+ if (xfs_has_metauuid(mp))
+ return offsetofend(struct xfs_dsb, sb_meta_uuid);
+ if (xfs_has_crc(mp))
+ return offsetofend(struct xfs_dsb, sb_lsn);
+ if (xfs_sb_version_hasmorebits(&mp->m_sb))
+ return offsetofend(struct xfs_dsb, sb_bad_features2);
+ if (xfs_has_logv2(mp))
+ return offsetofend(struct xfs_dsb, sb_logsunit);
+ if (xfs_has_sector(mp))
+ return offsetofend(struct xfs_dsb, sb_logsectsize);
+ /* only support dirv2 or more recent */
+ return offsetofend(struct xfs_dsb, sb_dirblklog);
+}
+
+/*
* Scrub the filesystem superblock.
*
* Note: We do /not/ attempt to check AG 0's superblock. Mount is
@@ -74,6 +103,7 @@ xchk_superblock(
struct xfs_buf *bp;
struct xfs_dsb *sb;
struct xfs_perag *pag;
+ size_t sblen;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -143,11 +173,19 @@ xchk_superblock(
if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
xchk_block_set_preen(sc, bp);
- if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
- xchk_block_set_preen(sc, bp);
+ if (xfs_has_metadir(sc->mp)) {
+ if (sb->sb_rbmino != cpu_to_be64(0))
+ xchk_block_set_corrupt(sc, bp);
- if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
- xchk_block_set_preen(sc, bp);
+ if (sb->sb_rsumino != cpu_to_be64(0))
+ xchk_block_set_corrupt(sc, bp);
+ } else {
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xchk_block_set_preen(sc, bp);
+ }
if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
xchk_block_set_corrupt(sc, bp);
@@ -165,8 +203,7 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
/* Check sb_versionnum bits that are set at mkfs time. */
- vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
- XFS_SB_VERSION_NUMBITS |
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS |
XFS_SB_VERSION_ALIGNBIT |
XFS_SB_VERSION_DALIGNBIT |
XFS_SB_VERSION_SHAREDBIT |
@@ -224,11 +261,19 @@ xchk_superblock(
* sb_icount, sb_ifree, sb_fdblocks, sb_frexents
*/
- if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
- xchk_block_set_preen(sc, bp);
+ if (xfs_has_metadir(mp)) {
+ if (sb->sb_uquotino != cpu_to_be64(0))
+ xchk_block_set_corrupt(sc, bp);
- if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
- xchk_block_set_preen(sc, bp);
+ if (sb->sb_gquotino != cpu_to_be64(0))
+ xchk_block_set_preen(sc, bp);
+ } else {
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xchk_block_set_preen(sc, bp);
+ }
/*
* Skip the quota flags since repair will force quotacheck.
@@ -337,8 +382,13 @@ xchk_superblock(
if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
xchk_block_set_corrupt(sc, bp);
- if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
- xchk_block_set_preen(sc, bp);
+ if (xfs_has_metadir(mp)) {
+ if (sb->sb_pquotino != cpu_to_be64(0))
+ xchk_block_set_corrupt(sc, bp);
+ } else {
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xchk_block_set_preen(sc, bp);
+ }
/* Don't care about sb_lsn */
}
@@ -349,9 +399,26 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
}
+ if (xfs_has_metadir(mp)) {
+ if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino))
+ xchk_block_set_preen(sc, bp);
+
+ if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog)
+ xchk_block_set_corrupt(sc, bp);
+
+ if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad)))
+ xchk_block_set_corrupt(sc, bp);
+ }
+
/* Everything else must be zero. */
- if (memchr_inv(sb + 1, 0,
- BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ sblen = xchk_superblock_ondisk_size(mp);
+ if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen))
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
@@ -434,7 +501,7 @@ xchk_agf_xref_btreeblks(
{
struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
xfs_agblock_t btreeblks;
int error;
@@ -483,7 +550,7 @@ xchk_agf_xref_refcblks(
struct xfs_scrub *sc)
{
struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
int error;
if (!sc->sa.refc_cur)
@@ -552,32 +619,32 @@ xchk_agf(
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
- if (eoag != pag->block_count)
+ if (eoag != pag_group(pag)->xg_block_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ agbno = be32_to_cpu(agf->agf_bno_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ agbno = be32_to_cpu(agf->agf_cnt_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ level = be32_to_cpu(agf->agf_bno_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ level = be32_to_cpu(agf->agf_cnt_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ agbno = be32_to_cpu(agf->agf_rmap_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ level = be32_to_cpu(agf->agf_rmap_level);
if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -816,7 +883,7 @@ xchk_agi_xref_fiblocks(
struct xfs_scrub *sc)
{
struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
int error = 0;
if (!xfs_has_inobtcounts(sc->mp))
@@ -865,6 +932,43 @@ xchk_agi_xref(
/* scrub teardown will take care of sc->sa for us */
}
+/*
+ * Check the unlinked buckets for links to bad inodes. We hold the AGI, so
+ * there cannot be any threads updating unlinked list pointers in this AG.
+ */
+STATIC void
+xchk_iunlink(
+ struct xfs_scrub *sc,
+ struct xfs_agi *agi)
+{
+ unsigned int i;
+ struct xfs_inode *ip;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]);
+
+ while (agino != NULLAGINO) {
+ if (agino % XFS_AGI_UNLINKED_BUCKETS != i) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ return;
+ }
+
+ agino = ip->i_next_unlinked;
+ }
+ }
+}
+
/* Scrub the AGI. */
int
xchk_agi(
@@ -895,7 +999,7 @@ xchk_agi(
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
- if (eoag != pag->block_count)
+ if (eoag != pag_group(pag)->xg_block_count)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check btree roots and levels */
@@ -949,6 +1053,8 @@ xchk_agi(
if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ xchk_iunlink(sc, agi);
+
xchk_agi_xref(sc);
out:
return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 26bd1ff68f1b..cd6f0223879f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -21,13 +21,18 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ag.h"
+#include "xfs_inode.h"
+#include "xfs_iunlink_item.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/agino_bitmap.h"
#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Superblock */
@@ -174,8 +179,7 @@ xrep_agf_find_btrees(
* We relied on the rmapbt to reconstruct the AGF. If we get a
* different root then something's seriously wrong.
*/
- if (fab[XREP_AGF_RMAPBT].root !=
- be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
@@ -204,8 +208,8 @@ xrep_agf_init_header(
memset(agf, 0, BBTOB(agf_bp->b_length));
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(pag->pag_agno);
- agf->agf_length = cpu_to_be32(pag->block_count);
+ agf->agf_seqno = cpu_to_be32(pag_agno(pag));
+ agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
@@ -224,20 +228,14 @@ xrep_agf_set_roots(
struct xfs_agf *agf,
struct xrep_find_ag_btree *fab)
{
- agf->agf_roots[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].root);
- agf->agf_levels[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+ agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+ agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
- agf->agf_roots[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].root);
- agf->agf_levels[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+ agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+ agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
- agf->agf_levels[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+ agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+ agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
@@ -258,12 +256,11 @@ xrep_agf_calc_from_btrees(
struct xfs_agf *agf = agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t btreeblks;
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -276,8 +273,7 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -333,12 +329,9 @@ xrep_agf_commit_new(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
@@ -391,7 +384,7 @@ xrep_agf(
* was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
if (error)
@@ -559,16 +552,14 @@ xrep_agfl_collect_blocks(
goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
@@ -656,7 +647,7 @@ xrep_agfl_fill(
xfs_agblock_t agbno = start;
int error;
- trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
+ trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len);
while (agbno < start + len && af->fl_off < af->flcount)
af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
@@ -696,7 +687,7 @@ xrep_agfl_init_header(
agfl = XFS_BUF_TO_AGFL(agfl_bp);
memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag));
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
/*
@@ -705,7 +696,7 @@ xrep_agfl_init_header(
* step.
*/
xagb_bitmap_init(&af.used_extents);
- af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
+ af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
if (error)
@@ -750,7 +741,7 @@ xrep_agfl(
* was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
if (error)
@@ -810,15 +801,57 @@ enum {
XREP_AGI_MAX
};
+#define XREP_AGI_LOOKUP_BATCH 32
+
+struct xrep_agi {
+ struct xfs_scrub *sc;
+
+ /* AGI buffer, tracked separately */
+ struct xfs_buf *agi_bp;
+
+ /* context for finding btree roots */
+ struct xrep_find_ag_btree fab[XREP_AGI_MAX];
+
+ /* old AGI contents in case we have to revert */
+ struct xfs_agi old_agi;
+
+ /* bitmap of which inodes are unlinked */
+ struct xagino_bitmap iunlink_bmp;
+
+ /* heads of the unlinked inode bucket lists */
+ xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS];
+
+ /* scratchpad for batched lookups of the radix tree */
+ struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH];
+
+ /* Map of ino -> next_ino for unlinked inode processing. */
+ struct xfarray *iunlink_next;
+
+ /* Map of ino -> prev_ino for unlinked inode processing. */
+ struct xfarray *iunlink_prev;
+};
+
+static void
+xrep_agi_buf_cleanup(
+ void *buf)
+{
+ struct xrep_agi *ragi = buf;
+
+ xfarray_destroy(ragi->iunlink_prev);
+ xfarray_destroy(ragi->iunlink_next);
+ xagino_bitmap_destroy(&ragi->iunlink_bmp);
+}
+
/*
* Given the inode btree roots described by *fab, find the roots, check them
* for sanity, and pass the root data back out via *fab.
*/
STATIC int
xrep_agi_find_btrees(
- struct xfs_scrub *sc,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xrep_find_ag_btree *fab = ragi->fab;
struct xfs_buf *agf_bp;
struct xfs_mount *mp = sc->mp;
int error;
@@ -851,10 +884,11 @@ xrep_agi_find_btrees(
*/
STATIC void
xrep_agi_init_header(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp,
- struct xfs_agi *old_agi)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_agi *old_agi = &ragi->old_agi;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
@@ -863,17 +897,13 @@ xrep_agi_init_header(
memset(agi, 0, BBTOB(agi_bp->b_length));
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(pag->pag_agno);
- agi->agi_length = cpu_to_be32(pag->block_count);
+ agi->agi_seqno = cpu_to_be32(pag_agno(pag));
+ agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- /* We don't know how to fix the unlinked list yet. */
- memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked,
- sizeof(agi->agi_unlinked));
-
/* Mark the incore AGF data stale until we're done fixing things. */
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
@@ -882,10 +912,12 @@ xrep_agi_init_header(
/* Set btree root information in an AGI. */
STATIC void
xrep_agi_set_roots(
- struct xfs_scrub *sc,
- struct xfs_agi *agi,
- struct xrep_find_ag_btree *fab)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ struct xrep_find_ag_btree *fab = ragi->fab;
+
agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
@@ -898,9 +930,10 @@ xrep_agi_set_roots(
/* Update the AGI counters. */
STATIC int
xrep_agi_calc_from_btrees(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_btree_cur *cur;
struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
@@ -908,12 +941,12 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
if (xfs_has_inobtcounts(mp)) {
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
@@ -926,10 +959,9 @@ xrep_agi_calc_from_btrees(
agi->agi_freecount = cpu_to_be32(freecount);
if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
- xfs_agblock_t blocks;
+ xfs_filblks_t blocks;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
- XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -943,12 +975,713 @@ err:
return error;
}
+/*
+ * Record a forwards unlinked chain pointer from agino -> next_agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t next_agino)
+{
+ ASSERT(next_agino != 0);
+
+ return xfarray_store(ragi->iunlink_next, agino, &next_agino);
+}
+
+/*
+ * Record a backwards unlinked chain pointer from prev_ino <- agino in our
+ * staging information.
+ */
+static inline int
+xrep_iunlink_store_prev(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino,
+ xfs_agino_t prev_agino)
+{
+ ASSERT(prev_agino != 0);
+
+ return xfarray_store(ragi->iunlink_prev, agino, &prev_agino);
+}
+
+/*
+ * Given an @agino, look up the next inode in the iunlink bucket. Returns
+ * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory
+ * like it should be, or a per-AG inode number.
+ */
+static inline xfs_agino_t
+xrep_iunlink_next(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino)
+{
+ struct xfs_inode *ip;
+
+ ip = xfs_iunlink_lookup(sc->sa.pag, agino);
+ if (!ip)
+ return 0;
+
+ return ip->i_next_unlinked;
+}
+
+/*
+ * Load the inode @agino into memory, set its i_prev_unlinked, and drop the
+ * inode so it can be inactivated. Returns NULLAGINO if we're at the end of
+ * the chain or if we should stop walking the chain due to corruption; or a
+ * per-AG inode number.
+ */
+STATIC xfs_agino_t
+xrep_iunlink_reload_next(
+ struct xrep_agi *ragi,
+ xfs_agino_t prev_agino,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_agino_t ret = NULLAGINO;
+ int error;
+
+ error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip);
+ if (error)
+ return ret;
+
+ trace_xrep_iunlink_reload_next(ip, prev_agino);
+
+ /* If this is a linked inode, stop processing the chain. */
+ if (VFS_I(ip)->i_nlink != 0) {
+ xrep_iunlink_store_next(ragi, agino, NULLAGINO);
+ goto rele;
+ }
+
+ ip->i_prev_unlinked = prev_agino;
+ ret = ip->i_next_unlinked;
+
+ /*
+ * Drop the inode reference that we just took. We hold the AGI, so
+ * this inode cannot move off the unlinked list and hence cannot be
+ * reclaimed.
+ */
+rele:
+ xchk_irele(sc, ip);
+ return ret;
+}
+
+/*
+ * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that
+ * still existed at mount time. This can happen if iunlink processing fails
+ * during log recovery.
+ */
+STATIC int
+xrep_iunlink_walk_ondisk_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino;
+ int error = 0;
+
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ xfs_agino_t agino = next_agino;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket,
+ prev_agino, agino);
+
+ if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS)
+ break;
+
+ next_agino = xrep_iunlink_next(sc, agino);
+ if (!next_agino)
+ next_agino = xrep_iunlink_reload_next(ragi, prev_agino,
+ agino);
+
+ prev_agino = agino;
+ }
+
+ return 0;
+}
+
+/* Decide if this is an unlinked inode in this AG. */
+STATIC bool
+xrep_iunlink_igrab(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = pag_mount(pag);
+
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
+ return false;
+
+ if (!xfs_inode_on_unlinked_list(ip))
+ return false;
+
+ return true;
+}
+
+/*
+ * Mark the given inode in the lookup batch in our unlinked inode bitmap, and
+ * remember if this inode is the start of the unlinked chain.
+ */
+STATIC int
+xrep_iunlink_visit(
+ struct xrep_agi *ragi,
+ unsigned int batch_idx)
+{
+ struct xfs_mount *mp = ragi->sc->mp;
+ struct xfs_inode *ip = ragi->lookup_batch[batch_idx];
+ xfs_agino_t agino;
+ unsigned int bucket;
+ int error;
+
+ ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag));
+ ASSERT(xfs_inode_on_unlinked_list(ip));
+
+ agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket,
+ ragi->iunlink_heads[bucket], ip);
+
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ if (error)
+ return error;
+
+ if (ip->i_prev_unlinked == NULLAGINO) {
+ if (ragi->iunlink_heads[bucket] == NULLAGINO)
+ ragi->iunlink_heads[bucket] = agino;
+ }
+
+ return 0;
+}
+
+/*
+ * Find all incore unlinked inodes so that we can rebuild the unlinked buckets.
+ * We hold the AGI so there should not be any modifications to the unlinked
+ * list.
+ */
+STATIC int
+xrep_iunlink_mark_incore(
+ struct xrep_agi *ragi)
+{
+ struct xfs_perag *pag = ragi->sc->sa.pag;
+ struct xfs_mount *mp = pag_mount(pag);
+ uint32_t first_index = 0;
+ bool done = false;
+ unsigned int nr_found = 0;
+
+ do {
+ unsigned int i;
+ int error = 0;
+
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ rcu_read_lock();
+
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)&ragi->lookup_batch, first_index,
+ XREP_AGI_LOOKUP_BATCH);
+ if (!nr_found) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = ragi->lookup_batch[i];
+
+ if (done || !xrep_iunlink_igrab(pag, ip))
+ ragi->lookup_batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!ragi->lookup_batch[i])
+ continue;
+ error = xrep_iunlink_visit(ragi, i);
+ if (error)
+ return error;
+ }
+ } while (!done);
+
+ return 0;
+}
+
+/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */
+STATIC int
+xrep_iunlink_mark_ondisk_rec(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_agi *ragi = priv;
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error = 0;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INODES_PER_CHUNK;
+ i++, agino++) {
+ struct xfs_inode *ip;
+ unsigned int len = 1;
+
+ /* Skip free inodes */
+ if (XFS_INOBT_MASK(i) & irec.ir_free)
+ continue;
+ /* Skip inodes we've seen before */
+ if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len))
+ continue;
+
+ /*
+ * Skip incore inodes; these were already picked up by
+ * the _mark_incore step.
+ */
+ rcu_read_lock();
+ ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino);
+ rcu_read_unlock();
+ if (ip)
+ continue;
+
+ /*
+ * Try to look up this inode. If we can't get it, just move
+ * on because we haven't actually scrubbed the inobt or the
+ * inodes yet.
+ */
+ error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino),
+ &ip);
+ if (error)
+ continue;
+
+ trace_xrep_iunlink_reload_ondisk(ip);
+
+ if (VFS_I(ip)->i_nlink == 0)
+ error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Find ondisk inodes that are unlinked and not in cache, and mark them in
+ * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if
+ * the btree is corrupt.
+ */
+STATIC void
+xrep_iunlink_mark_ondisk(
+ struct xrep_agi *ragi)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
+ error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
+ xfs_btree_del_cursor(cur, error);
+}
+
+/*
+ * Walk an iunlink bucket's inode list. For each inode that should be on this
+ * chain, clear its entry in in iunlink_bmp because it's ok and we don't need
+ * to touch it further.
+ */
+STATIC int
+xrep_iunlink_resolve_bucket(
+ struct xrep_agi *ragi,
+ unsigned int bucket)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino = NULLAGINO;
+ xfs_agino_t next_agino = ragi->iunlink_heads[bucket];
+ int error = 0;
+
+ while (next_agino != NULLAGINO) {
+ if (xchk_should_terminate(ragi->sc, &error))
+ return error;
+
+ /* Find the next inode in the chain. */
+ ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
+ if (!ip) {
+ /* Inode not incore? Terminate the chain. */
+ trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = NULLAGINO;
+ break;
+ }
+
+ if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
+ /*
+ * Inode is in the wrong bucket. Advance the list,
+ * but pretend we didn't see this inode.
+ */
+ trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ if (!xfs_inode_on_unlinked_list(ip)) {
+ /*
+ * Incore inode doesn't think this inode is on an
+ * unlinked list. This is probably because we reloaded
+ * it from disk. Advance the list, but pretend we
+ * didn't see this inode; we'll fix that later.
+ */
+ trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
+ bucket, prev_agino, next_agino);
+ next_agino = ip->i_next_unlinked;
+ continue;
+ }
+
+ trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
+ next_agino);
+
+ /*
+ * Otherwise, this inode's unlinked pointers are ok. Clear it
+ * from the unlinked bitmap since we're done with it, and make
+ * sure the chain is still correct.
+ */
+ error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
+ if (error)
+ return error;
+
+ /* Remember the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino,
+ next_agino);
+ if (error)
+ return error;
+ }
+
+ /* Remember this inode's previous pointer. */
+ error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
+ if (error)
+ return error;
+
+ /* Advance the list and remember this inode. */
+ prev_agino = next_agino;
+ next_agino = ip->i_next_unlinked;
+ }
+
+ /* Update the previous inode's next pointer. */
+ if (prev_agino != NULLAGINO) {
+ error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Reinsert this unlinked inode into the head of the staged bucket list. */
+STATIC int
+xrep_iunlink_add_to_bucket(
+ struct xrep_agi *ragi,
+ xfs_agino_t agino)
+{
+ xfs_agino_t current_head;
+ unsigned int bucket;
+ int error;
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+ /* Point this inode at the current head of the bucket list. */
+ current_head = ragi->iunlink_heads[bucket];
+
+ trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
+ current_head);
+
+ error = xrep_iunlink_store_next(ragi, agino, current_head);
+ if (error)
+ return error;
+
+ /* Remember the head inode's previous pointer. */
+ if (current_head != NULLAGINO) {
+ error = xrep_iunlink_store_prev(ragi, current_head, agino);
+ if (error)
+ return error;
+ }
+
+ ragi->iunlink_heads[bucket] = agino;
+ return 0;
+}
+
+/* Reinsert unlinked inodes into the staged iunlink buckets. */
+STATIC int
+xrep_iunlink_add_lost_inodes(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_agi *ragi = priv;
+ int error;
+
+ for (; len > 0; start++, len--) {
+ error = xrep_iunlink_add_to_bucket(ragi, start);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Figure out the iunlink bucket values and find inodes that need to be
+ * reinserted into the list.
+ */
+STATIC int
+xrep_iunlink_rebuild_buckets(
+ struct xrep_agi *ragi)
+{
+ unsigned int i;
+ int error;
+
+ /*
+ * Walk the ondisk AGI unlinked list to find inodes that are on the
+ * list but aren't in memory. This can happen if a past log recovery
+ * tried to clear the iunlinked list but failed. Our scan rebuilds the
+ * unlinked list using incore inodes, so we must load and link them
+ * properly.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Record all the incore unlinked inodes in iunlink_bmp that we didn't
+ * find by walking the ondisk iunlink buckets. This shouldn't happen,
+ * but we can't risk forgetting an inode somewhere.
+ */
+ error = xrep_iunlink_mark_incore(ragi);
+ if (error)
+ return error;
+
+ /*
+ * If there are ondisk inodes that are unlinked and are not been loaded
+ * into cache, record them in iunlink_bmp.
+ */
+ xrep_iunlink_mark_ondisk(ragi);
+
+ /*
+ * Walk each iunlink bucket to (re)construct as much of the incore list
+ * as would be correct. For each inode that survives this step, mark
+ * it clear in iunlink_bmp; we're done with those inodes.
+ */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ error = xrep_iunlink_resolve_bucket(ragi, i);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Any unlinked inodes that we didn't find through the bucket list
+ * walk (or was ignored by the walk) must be inserted into the bucket
+ * list. Stage this in memory for now.
+ */
+ return xagino_bitmap_walk(&ragi->iunlink_bmp,
+ xrep_iunlink_add_lost_inodes, ragi);
+}
+
+/* Update i_next_iunlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_next(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t next_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_agino_t prev_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the backward pointer since this just came off disk. */
+ error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+ /* Update the forward pointer. */
+ if (ip->i_next_unlinked != next_agino) {
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Update i_prev_iunlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_prev(
+ struct xrep_agi *ragi,
+ xfarray_idx_t idx,
+ xfs_agino_t prev_agino)
+{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_inode *ip;
+ xfarray_idx_t agino = idx - 1;
+ bool want_rele = false;
+ int error = 0;
+
+ ASSERT(prev_agino != 0);
+
+ ip = xfs_iunlink_lookup(pag, agino);
+ if (!ip) {
+ xfs_agino_t next_agino;
+
+ /*
+ * No inode exists in cache. Load it off the disk so that we
+ * can reinsert it into the incore unlinked list.
+ */
+ error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
+ if (error)
+ return -EFSCORRUPTED;
+
+ want_rele = true;
+
+ /* Set the forward pointer since this just came off disk. */
+ error = xfarray_load(ragi->iunlink_prev, agino, &next_agino);
+ if (error)
+ goto out_rele;
+
+ error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+ if (error)
+ goto out_rele;
+
+ trace_xrep_iunlink_relink_next(ip, next_agino);
+ ip->i_next_unlinked = next_agino;
+ }
+
+ /* Update the backward pointer. */
+ if (ip->i_prev_unlinked != prev_agino) {
+ trace_xrep_iunlink_relink_prev(ip, prev_agino);
+ ip->i_prev_unlinked = prev_agino;
+ }
+
+out_rele:
+ /*
+ * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+ * and the inode cannot be reclaimed. However, if we used iget to load
+ * a missing inode, we must irele it here.
+ */
+ if (want_rele)
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Log all the iunlink updates we need to finish regenerating the AGI. */
+STATIC int
+xrep_iunlink_commit(
+ struct xrep_agi *ragi)
+{
+ struct xfs_agi *agi = ragi->agi_bp->b_addr;
+ xfarray_idx_t idx = XFARRAY_CURSOR_INIT;
+ xfs_agino_t agino;
+ unsigned int i;
+ int error;
+
+ /* Fix all the forward links */
+ while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_next(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Fix all the back links */
+ idx = XFARRAY_CURSOR_INIT;
+ while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
+ error = xrep_iunlink_relink_prev(ragi, idx, agino);
+ if (error)
+ return error;
+ }
+
+ /* Copy the staged iunlink buckets to the new AGI. */
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+ trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
+ be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
+ ragi->iunlink_heads[i]);
+
+ agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
+ }
+
+ return 0;
+}
+
/* Trigger reinitialization of the in-core data. */
STATIC int
xrep_agi_commit_new(
- struct xfs_scrub *sc,
- struct xfs_buf *agi_bp)
+ struct xrep_agi *ragi)
{
+ struct xfs_scrub *sc = ragi->sc;
+ struct xfs_buf *agi_bp = ragi->agi_bp;
struct xfs_perag *pag;
struct xfs_agi *agi = agi_bp->b_addr;
@@ -971,48 +1704,76 @@ xrep_agi_commit_new(
/* Repair the AGI. */
int
xrep_agi(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc)
{
- struct xrep_find_ag_btree fab[XREP_AGI_MAX] = {
- [XREP_AGI_INOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_FINOBT] = {
- .rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_finobt_buf_ops,
- .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
- },
- [XREP_AGI_END] = {
- .buf_ops = NULL
- },
- };
- struct xfs_agi old_agi;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *agi_bp;
- struct xfs_agi *agi;
- int error;
+ struct xrep_agi *ragi;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned int i;
+ int error;
/* We require the rmapbt to rebuild anything. */
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
+ sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+ ragi = sc->buf;
+ ragi->sc = sc;
+
+ ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
+ };
+ ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){
+ .buf_ops = NULL,
+ };
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ragi->iunlink_heads[i] = NULLAGINO;
+
+ xagino_bitmap_init(&ragi->iunlink_bmp);
+ sc->buf_cleanup = xrep_agi_buf_cleanup;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_next);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers");
+ error = xfarray_create(descr, 0, sizeof(xfs_agino_t),
+ &ragi->iunlink_prev);
+ kfree(descr);
+ if (error)
+ return error;
+
/*
* Make sure we have the AGI buffer, as scrub might have decided it
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
if (error)
return error;
- agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = agi_bp->b_addr;
+ ragi->agi_bp->b_ops = &xfs_agi_buf_ops;
/* Find the AGI btree roots. */
- error = xrep_agi_find_btrees(sc, fab);
+ error = xrep_agi_find_btrees(ragi);
+ if (error)
+ return error;
+
+ error = xrep_iunlink_rebuild_buckets(ragi);
if (error)
return error;
@@ -1021,18 +1782,21 @@ xrep_agi(
return error;
/* Start rewriting the header and implant the btrees we found. */
- xrep_agi_init_header(sc, agi_bp, &old_agi);
- xrep_agi_set_roots(sc, agi, fab);
- error = xrep_agi_calc_from_btrees(sc, agi_bp);
+ xrep_agi_init_header(ragi);
+ xrep_agi_set_roots(ragi);
+ error = xrep_agi_calc_from_btrees(ragi);
+ if (error)
+ goto out_revert;
+ error = xrep_iunlink_commit(ragi);
if (error)
goto out_revert;
/* Reinitialize in-core state. */
- return xrep_agi_commit_new(sc, agi_bp);
+ return xrep_agi_commit_new(ragi);
out_revert:
/* Mark the incore AGI state stale and revert the AGI. */
clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
- memcpy(agi, &old_agi, sizeof(old_agi));
+ memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
return error;
}
diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h
new file mode 100644
index 000000000000..56d7db5f1699
--- /dev/null
+++ b/fs/xfs/scrub/agino_bitmap.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGINO_BITMAP_H__
+#define __XFS_SCRUB_AGINO_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_agino_t */
+
+struct xagino_bitmap {
+ struct xbitmap32 aginobitmap;
+};
+
+static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->aginobitmap);
+}
+
+static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->aginobitmap);
+}
+
+static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_clear(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int len)
+{
+ return xbitmap32_set(&bitmap->aginobitmap, agino, len);
+}
+
+static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap,
+ xfs_agino_t agino, unsigned int *len)
+{
+ return xbitmap32_test(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->aginobitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index d1b8a4997dd2..8b282138097f 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -139,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
- if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_alloc_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 45edda096869..bed6a09aa791 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -132,17 +132,16 @@ int
xrep_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ struct xfs_group *xg = pag_group(sc->sa.pag);
unsigned int busy_gen;
/*
* Make sure the busy extent list is clear because we can't put extents
* on there twice.
*/
- busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
- if (xfs_extent_busy_list_empty(sc->sa.pag))
+ if (xfs_extent_busy_list_empty(xg, &busy_gen))
return 0;
-
- return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
+ return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0);
}
/* Check for any obvious conflicts in the free extent. */
@@ -210,7 +209,7 @@ xrep_abt_stash(
if (error)
return error;
- trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+ trace_xrep_abt_found(sc->sa.pag, &arec);
error = xfarray_append(ra->free_records, &arec);
if (error)
@@ -484,8 +483,8 @@ xrep_abt_reserve_space(
ASSERT(arec.ar_blockcount <= UINT_MAX);
len = min_t(unsigned int, arec.ar_blockcount, desired);
- trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
- arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+ trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock,
+ len, XFS_RMAP_OWN_AG);
error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
arec.ar_startblock, len);
@@ -543,8 +542,9 @@ xrep_abt_dispose_one(
/* Add a deferred rmap for each extent we used. */
if (resv->used > 0)
- xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
- resv->used, XFS_RMAP_OWN_AG);
+ xfs_rmap_alloc_extent(sc->tp, false,
+ xfs_agbno_to_fsb(pag, resv->agbno), resv->used,
+ XFS_RMAP_OWN_AG);
/*
* For each reserved btree block we didn't use, add it to the free
@@ -554,8 +554,8 @@ xrep_abt_dispose_one(
if (free_aglen == 0)
return 0;
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
- free_aglen, ra->new_bnobt.oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ ra->new_bnobt.oinfo.oi_owner);
error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
@@ -687,8 +687,8 @@ xrep_abt_reset_counters(
* height values before re-initializing the perag info from the updated
* AGF to capture all the new values.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+ pag->pagf_repair_bno_level = pag->pagf_bno_level;
+ pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
/* Reinitialize with the values we just logged. */
return xrep_reinit_pagf(sc);
@@ -735,10 +735,11 @@ xrep_abt_build_new_trees(
ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
/* Allocate cursors for the staged btrees. */
- bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
- pag, XFS_BTNUM_BNO);
- cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
- pag, XFS_BTNUM_CNT);
+ bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
+
+ cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
/* Last chance to abort before we start committing fixes. */
if (xchk_should_terminate(sc, &error))
@@ -765,10 +766,8 @@ xrep_abt_build_new_trees(
* height so that we don't trip the verifiers when writing the new
* btree blocks to disk.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
- ra->new_bnobt.bload.btree_height;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
- ra->new_cntbt.bload.btree_height;
+ pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
/* Load the free space by length tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
@@ -779,7 +778,7 @@ xrep_abt_build_new_trees(
error = xrep_bnobt_sort_records(ra);
if (error)
- return error;
+ goto err_levels;
/* Load the free space by block number tree. */
ra->array_cur = XFARRAY_CURSOR_INIT;
@@ -807,8 +806,8 @@ xrep_abt_build_new_trees(
return xrep_roll_ag_trans(sc);
err_levels:
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
err_cur:
xfs_btree_del_cursor(cnt_cur, error);
xfs_btree_del_cursor(bno_cur, error);
@@ -838,8 +837,8 @@ xrep_abt_remove_old_trees(
* Now that we've zapped all the old allocbt blocks we can turn off
* the alternate height mechanism.
*/
- pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
- pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
return 0;
}
@@ -850,6 +849,7 @@ xrep_allocbt(
{
struct xrep_abt *ra;
struct xfs_mount *mp = sc->mp;
+ unsigned int busy_gen;
char *descr;
int error;
@@ -870,7 +870,7 @@ xrep_allocbt(
* on there twice. In theory we cleared this before we started, but
* let's not risk the filesystem.
*/
- if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) {
error = -EDEADLOCK;
goto out_ra;
}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 83c7feb38714..708334f9b2bd 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -10,16 +10,20 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_sf.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/attr.h"
+#include "scrub/listxattr.h"
+#include "scrub/repair.h"
/* Free the buffers linked from the xattr buffer. */
static void
@@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup(
kvfree(ab->value);
ab->value = NULL;
ab->value_sz = 0;
+ kvfree(ab->name);
+ ab->name = NULL;
}
/*
@@ -65,7 +71,7 @@ xchk_xattr_want_freemap(
* reallocating the buffer if necessary. Buffer contents are not preserved
* across a reallocation.
*/
-static int
+int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
size_t value_size)
@@ -95,6 +101,12 @@ xchk_setup_xattr_buf(
return -ENOMEM;
}
+ if (xchk_could_repair(sc)) {
+ ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!ab->name)
+ return -ENOMEM;
+ }
+
resize_value:
if (ab->value_sz >= value_size)
return 0;
@@ -121,6 +133,12 @@ xchk_setup_xattr(
{
int error;
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_xattr(sc);
+ if (error)
+ return error;
+ }
+
/*
* We failed to get memory while checking attrs, so this time try to
* get all the memory we're ever going to need. Allocate the buffer
@@ -137,106 +155,105 @@ xchk_setup_xattr(
/* Extended Attributes */
-struct xchk_xattr {
- struct xfs_attr_list_context context;
- struct xfs_scrub *sc;
-};
-
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
- * to call this function for every attribute key in an inode. Once
- * we're here, we load the attribute value to see if any errors happen,
- * or if we get more or less data than we expected.
+ * We use the extended attribute walk helper to call this function for every
+ * attribute key in an inode. Once we're here, we load the attribute value to
+ * see if any errors happen, or if we get more or less data than we expected.
*/
-static void
-xchk_xattr_listent(
- struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
+static int
+xchk_xattr_actor(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
{
struct xfs_da_args args = {
- .op_flags = XFS_DA_OP_NOTIME,
- .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK,
- .geo = context->dp->i_mount->m_attr_geo,
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
.whichfork = XFS_ATTR_FORK,
- .dp = context->dp,
+ .dp = ip,
.name = name,
.namelen = namelen,
- .hashval = xfs_da_hashname(name, namelen),
- .trans = context->tp,
+ .trans = sc->tp,
.valuelen = valuelen,
+ .owner = ip->i_ino,
};
struct xchk_xattr_buf *ab;
- struct xchk_xattr *sx;
int error = 0;
- sx = container_of(context, struct xchk_xattr, context);
- ab = sx->sc->buf;
+ ab = sc->buf;
- if (xchk_should_terminate(sx->sc, &error)) {
- context->seen_enough = error;
- return;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (attr_flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- if (flags & XFS_ATTR_INCOMPLETE) {
+ if (attr_flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
- xchk_ino_set_preen(sx->sc, context->dp->i_ino);
- return;
+ xchk_ino_set_preen(sc, ip->i_ino);
+ return 0;
}
- /* Only one namespace bit allowed. */
- if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(attr_flags, name, namelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
- /* Does this name make sense? */
- if (!xfs_attr_namecheck(name, namelen)) {
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- goto fail_xref;
+ /* Check parent pointer record. */
+ if ((attr_flags & XFS_ATTR_PARENT) &&
+ !xfs_parent_valuecheck(sc->mp, value, valuelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+ return -ECANCELED;
}
/*
- * Local xattr values are stored in the attr leaf block, so we don't
- * need to retrieve the value from a remote block to detect corruption
- * problems.
+ * Try to allocate enough memory to extract the attr value. If that
+ * doesn't work, return -EDEADLOCK as a signal to try again with a
+ * maximally sized buffer.
*/
- if (flags & XFS_ATTR_LOCAL)
- goto fail_xref;
+ error = xchk_setup_xattr_buf(sc, valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
/*
- * Try to allocate enough memory to extrat the attr value. If that
- * doesn't work, we overload the seen_enough variable to convey
- * the error message back to the main scrub function.
+ * Parent pointers are matched on attr name and value, so we must
+ * supply the xfs_parent_rec here when confirming that the dabtree
+ * indexing works correctly.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen);
- if (error == -ENOMEM)
- error = -EDEADLOCK;
- if (error) {
- context->seen_enough = error;
- return;
- }
+ if (attr_flags & XFS_ATTR_PARENT)
+ memcpy(ab->value, value, valuelen);
args.value = ab->value;
+ /*
+ * Get the attr value to ensure that lookup can find this attribute
+ * through the dabtree indexing and that remote value retrieval also
+ * works correctly.
+ */
+ xfs_attr_sethash(&args);
error = xfs_attr_get_ilocked(&args);
/* ENODATA means the hash lookup failed and the attr is bad */
if (error == -ENODATA)
error = -EFSCORRUPTED;
- if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+ if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno,
&error))
- goto fail_xref;
+ return error;
if (args.valuelen != valuelen)
- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
- args.blkno);
-fail_xref:
- if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- context->seen_enough = 1;
- return;
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno);
+
+ return 0;
}
/*
@@ -246,7 +263,7 @@ fail_xref:
* Within a char, the lowest bit of the char represents the byte with
* the smallest address
*/
-STATIC bool
+bool
xchk_xattr_set_map(
struct xfs_scrub *sc,
unsigned long *map,
@@ -403,6 +420,17 @@ xchk_xattr_block(
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ /*
+ * Empty xattr leaf blocks mapped at block 0 are probably a byproduct
+ * of a race between setxattr and a log shutdown. Anywhere else in the
+ * attr fork is a corruption.
+ */
+ if (leafhdr.count == 0) {
+ if (blk->blkno == 0)
+ xchk_da_set_preen(ds, level);
+ else
+ xchk_da_set_corrupt(ds, level);
+ }
if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
xchk_da_set_corrupt(ds, level);
if (leafhdr.firstused > mp->m_attr_geo->blksize)
@@ -411,6 +439,8 @@ xchk_xattr_block(
xchk_da_set_corrupt(ds, level);
if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize))
xchk_da_set_corrupt(ds, level);
+ if (leafhdr.holes)
+ xchk_da_set_preen(ds, level);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
@@ -463,7 +493,6 @@ xchk_xattr_rec(
xfs_dahash_t hash;
int nameidx;
int hdrsize;
- unsigned int badflags;
int error;
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -493,10 +522,15 @@ xchk_xattr_rec(
/* Retrieve the entry and check it. */
hash = be32_to_cpu(ent->hashval);
- badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
- XFS_ATTR_INCOMPLETE);
- if ((ent->flags & badflags) != 0)
+ if (ent->flags & ~XFS_ATTR_ONDISK_MASK) {
+ xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+ if (!xfs_attr_check_namespace(ent->flags)) {
xchk_da_set_corrupt(ds, level);
+ return 0;
+ }
+
if (ent->flags & XFS_ATTR_LOCAL) {
lentry = (struct xfs_attr_leaf_name_local *)
(((char *)bp->b_addr) + nameidx);
@@ -504,7 +538,10 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen);
+ calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval,
+ lentry->namelen,
+ lentry->nameval + lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
} else {
rentry = (struct xfs_attr_leaf_name_remote *)
(((char *)bp->b_addr) + nameidx);
@@ -512,7 +549,13 @@ xchk_xattr_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- calc_hash = xfs_da_hashname(rentry->name, rentry->namelen);
+ if (ent->flags & XFS_ATTR_PARENT) {
+ xchk_da_set_corrupt(ds, level);
+ goto out;
+ }
+ calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name,
+ rentry->namelen, NULL,
+ be32_to_cpu(rentry->valuelen));
}
if (calc_hash != hash)
xchk_da_set_corrupt(ds, level);
@@ -556,6 +599,15 @@ xchk_xattr_check_sf(
break;
}
+ /*
+ * Shortform entries do not set LOCAL or INCOMPLETE, so the
+ * only valid flag bits here are for namespaces.
+ */
+ if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
if (!xchk_xattr_set_map(sc, ab->usedmap,
(char *)sfe - (char *)sf,
sizeof(struct xfs_attr_sf_entry))) {
@@ -588,16 +640,6 @@ int
xchk_xattr(
struct xfs_scrub *sc)
{
- struct xchk_xattr sx = {
- .sc = sc,
- .context = {
- .dp = sc->ip,
- .tp = sc->tp,
- .resynch = 1,
- .put_listent = xchk_xattr_listent,
- .allow_incomplete = true,
- },
- };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -626,12 +668,6 @@ xchk_xattr(
/*
* Look up every xattr in this file by name and hash.
*
- * Use the backend implementation of xfs_attr_list to call
- * xchk_xattr_listent on every attribute key in this inode.
- * In other words, we use the same iterator/callback mechanism
- * that listattr uses to scrub extended attributes, though in our
- * _listent function, we check the value of the attribute.
- *
* The VFS only locks i_rwsem when modifying attrs, so keep all
* three locks held because that's the only way to ensure we're
* the only thread poking into the da btree. We traverse the da
@@ -639,13 +675,9 @@ xchk_xattr(
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_ilocked(&sx.context);
+ error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
return error;
- /* Did our listent function try to return any errors? */
- if (sx.context.seen_enough < 0)
- return sx.context.seen_enough;
-
return 0;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 48fd9402c432..7db58af56646 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -16,9 +16,16 @@ struct xchk_xattr_buf {
/* Bitmap of free space in xattr leaf blocks. */
unsigned long *freemap;
+ /* Memory buffer used to hold salvaged xattr names. */
+ unsigned char *name;
+
/* Memory buffer used to extract xattr values. */
void *value;
size_t value_sz;
};
+bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map,
+ unsigned int start, unsigned int len);
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size);
+
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
new file mode 100644
index 000000000000..c7eb94069caf
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_acl.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr.h"
+#include "scrub/reap.h"
+#include "scrub/attr_repair.h"
+
+/*
+ * Extended Attribute Repair
+ * =========================
+ *
+ * We repair extended attributes by reading the attr leaf blocks looking for
+ * attributes entries that look salvageable (name passes verifiers, value can
+ * be retrieved, etc). Each extended attribute worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * file to constrain memory use. Batching the construction of the temporary
+ * extended attribute structure in this fashion reduces lock cycling of the
+ * file being repaired and the temporary file.
+ *
+ * When salvaging completes, the remaining stashed attributes are replayed to
+ * the temporary file. An atomic file contents exchange is used to commit the
+ * new xattr blocks to the file being repaired. This will disrupt attrmulti
+ * cursors.
+ */
+
+struct xrep_xattr_key {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_xattr {
+ struct xfs_scrub *sc;
+
+ /* Information for exchanging attr fork mappings at the end. */
+ struct xrep_tempexch tx;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Number of attributes that we are salvaging. */
+ unsigned long long attrs_found;
+
+ /* Can we flush stashed attrs to the tempfile? */
+ bool can_flush;
+
+ /* Did the live update fail, and hence the repair is now out of date? */
+ bool live_update_aborted;
+
+ /* Lock protecting parent pointer updates */
+ struct mutex lock;
+
+ /* Fixed-size array of xrep_xattr_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Hook to capture parent pointer updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Scratch buffer for capturing parent pointers. */
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_XATTR_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_XATTR_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_xattr_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_XATTR_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/* Set up to recreate the extended attributes. */
+int
+xrep_setup_xattr(
+ struct xfs_scrub *sc)
+{
+ if (xfs_has_parent(sc->mp))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ return xrep_tempfile_create(sc, S_IFREG);
+}
+
+/*
+ * Decide if we want to salvage this attribute. We don't bother with
+ * incomplete or oversized keys or values. The @value parameter can be null
+ * for remote attrs.
+ */
+STATIC int
+xrep_xattr_want_salvage(
+ struct xrep_xattr *rx,
+ unsigned int attr_flags,
+ const void *name,
+ int namelen,
+ const void *value,
+ int valuelen)
+{
+ if (attr_flags & XFS_ATTR_INCOMPLETE)
+ return false;
+ if (namelen > XATTR_NAME_MAX || namelen <= 0)
+ return false;
+ if (!xfs_attr_namecheck(attr_flags, name, namelen))
+ return false;
+ if (valuelen > XATTR_SIZE_MAX || valuelen < 0)
+ return false;
+ if (attr_flags & XFS_ATTR_PARENT)
+ return xfs_parent_valuecheck(rx->sc->mp, value, valuelen);
+
+ return true;
+}
+
+/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */
+STATIC int
+xrep_xattr_salvage_key(
+ struct xrep_xattr *rx,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ unsigned char *value,
+ int valuelen)
+{
+ struct xrep_xattr_key key = {
+ .valuelen = valuelen,
+ .flags = flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this attribute.
+ */
+ if (flags & XFS_ATTR_PARENT) {
+ key.namelen = namelen;
+
+ trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name,
+ key.namelen, value, valuelen);
+ } else {
+ while (i < namelen && name[i] != 0)
+ i++;
+ if (i == 0)
+ return 0;
+ key.namelen = i;
+
+ trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name,
+ key.namelen, valuelen);
+ }
+
+ error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ error = xfarray_append(rx->xattr_records, &key);
+ if (error)
+ return error;
+
+ rx->attrs_found++;
+ return 0;
+}
+
+/*
+ * Record a shortform extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_sf_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_sf_hdr *hdr,
+ struct xfs_attr_sf_entry *sfe)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ unsigned char *name = sfe->nameval;
+ unsigned char *value = &sfe->nameval[sfe->namelen];
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr,
+ sfe->namelen))
+ return 0;
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr,
+ sfe->valuelen))
+ return 0;
+
+ if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen))
+ return 0;
+
+ return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval,
+ sfe->namelen, value, sfe->valuelen);
+}
+
+/*
+ * Record a local format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_local_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_local *lentry)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ unsigned char *value;
+ unsigned int valuelen;
+ unsigned int namesize;
+
+ /*
+ * Decode the leaf local entry format. If something seems wrong, we
+ * junk the attribute.
+ */
+ value = &lentry->nameval[lentry->namelen];
+ valuelen = be16_to_cpu(lentry->valuelen);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen);
+ if ((char *)lentry + namesize > buf_end)
+ return 0;
+ if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /* Try to save this attribute. */
+ return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval,
+ lentry->namelen, value, valuelen);
+}
+
+/*
+ * Record a remote format extended attribute key & value for later reinsertion
+ * into the inode.
+ */
+STATIC int
+xrep_xattr_salvage_remote_attr(
+ struct xrep_xattr *rx,
+ struct xfs_attr_leaf_entry *ent,
+ unsigned int nameidx,
+ const char *buf_end,
+ struct xfs_attr_leaf_name_remote *rentry,
+ unsigned int ent_idx,
+ struct xfs_buf *leaf_bp)
+{
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ struct xfs_da_args args = {
+ .trans = rx->sc->tp,
+ .dp = rx->sc->ip,
+ .index = ent_idx,
+ .geo = rx->sc->mp->m_attr_geo,
+ .owner = rx->sc->ip->i_ino,
+ .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .namelen = rentry->namelen,
+ .name = rentry->name,
+ .value = ab->value,
+ .valuelen = be32_to_cpu(rentry->valuelen),
+ };
+ unsigned int namesize;
+ int error;
+
+ /*
+ * Decode the leaf remote entry format. If something seems wrong, we
+ * junk the attribute. Note that we should never find a zero-length
+ * remote attribute value.
+ */
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ if ((char *)rentry + namesize > buf_end)
+ return 0;
+ if (args.valuelen == 0 ||
+ !xrep_xattr_want_salvage(rx, ent->flags, rentry->name,
+ rentry->namelen, NULL, args.valuelen))
+ return 0;
+ if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize))
+ return 0;
+
+ /*
+ * Enlarge the buffer (if needed) to hold the value that we're trying
+ * to salvage from the old extended attribute data.
+ */
+ error = xchk_setup_xattr_buf(rx->sc, args.valuelen);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ return error;
+
+ /* Look up the remote value and stash it for reconstruction. */
+ error = xfs_attr3_leaf_getvalue(leaf_bp, &args);
+ if (error || args.rmtblkno == 0)
+ goto err_free;
+
+ error = xfs_attr_rmtval_get(&args);
+ if (error)
+ goto err_free;
+
+ /* Try to save this attribute. */
+ error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name,
+ rentry->namelen, ab->value, args.valuelen);
+err_free:
+ /* remote value was garbage, junk it */
+ if (error == -EFSBADCRC || error == -EFSCORRUPTED)
+ error = 0;
+ return error;
+}
+
+/* Extract every xattr key that we can from this attr fork block. */
+STATIC int
+xrep_xattr_recover_leaf(
+ struct xrep_xattr *rx,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ char *buf_end;
+ size_t off;
+ unsigned int nameidx;
+ unsigned int hdrsize;
+ int i;
+ int error = 0;
+
+ bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize);
+
+ /* Check the leaf header */
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Skip key if it conflicts with something else? */
+ off = (char *)ent - (char *)leaf;
+ if (!xchk_xattr_set_map(sc, ab->usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t)))
+ continue;
+
+ /* Check the name information. */
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr.firstused ||
+ nameidx >= mp->m_attr_geo->blksize)
+ continue;
+
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, i);
+ error = xrep_xattr_salvage_local_attr(rx, ent, nameidx,
+ buf_end, lentry);
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, i);
+ error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx,
+ buf_end, rentry, i, bp);
+ }
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform attrs. */
+STATIC int
+xrep_xattr_recover_sf(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xchk_xattr_buf *ab = sc->buf;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_attr_sf_entry *next;
+ struct xfs_ifork *ifp;
+ unsigned char *end;
+ int i;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK);
+ hdr = ifp->if_data;
+
+ bitmap_zero(ab->usedmap, ifp->if_bytes);
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr));
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ if ((unsigned char *)sfe > end)
+ return 0;
+
+ for (i = 0; i < hdr->count; i++) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ next = xfs_attr_sf_nextentry(sfe);
+ if ((unsigned char *)next > end)
+ break;
+
+ if (xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)sfe - (char *)hdr,
+ sizeof(struct xfs_attr_sf_entry))) {
+ /*
+ * No conflicts with the sf entry; let's save this
+ * attribute.
+ */
+ error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe);
+ if (error)
+ return error;
+ }
+
+ sfe = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to return a buffer of xattr data for a given physical extent.
+ *
+ * Because the buffer cache get function complains if it finds a buffer
+ * matching the block number but not matching the length, we must be careful to
+ * look for incore buffers (up to the maximum length of a remote value) that
+ * could be hiding anywhere in the physical range. If we find an incore
+ * buffer, we can pass that to the caller. Optionally, read a single block and
+ * pass that back.
+ *
+ * Note the subtlety that remote attr value blocks for which there is no incore
+ * buffer will be passed to the callback one block at a time. These buffers
+ * will not have any ops attached and must be staled to prevent aliasing with
+ * multiblock buffers once we drop the ILOCK.
+ */
+STATIC int
+xrep_xattr_find_buf(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ bool can_read,
+ struct xfs_buf **bpp)
+{
+ struct xrep_bufscan scan = {
+ .daddr = XFS_FSB_TO_DADDR(mp, fsbno),
+ .max_sectors = xrep_bufscan_max_sectors(mp, max_len),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ *bpp = bp;
+ return 0;
+ }
+
+ if (!can_read) {
+ *bpp = NULL;
+ return 0;
+ }
+
+ return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1),
+ XBF_TRYLOCK, bpp, NULL);
+}
+
+/*
+ * Deal with a buffer that we found during our walk of the attr fork.
+ *
+ * Attribute leaf and node blocks are simple -- they're a single block, so we
+ * can walk them one at a time and we never have to worry about discontiguous
+ * multiblock buffers like we do for directories.
+ *
+ * Unfortunately, remote attr blocks add a lot of complexity here. Each disk
+ * block is totally self contained, in the sense that the v5 header provides no
+ * indication that there could be more data in the next block. The incore
+ * buffers can span multiple blocks, though they never cross extent records.
+ * However, they don't necessarily start or end on an extent record boundary.
+ * Therefore, we need a special buffer find function to walk the buffer cache
+ * for us.
+ *
+ * The caller must hold the ILOCK on the file being repaired. We use
+ * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't
+ * own the block and don't want to hang the system on a potentially garbage
+ * buffer.
+ */
+STATIC int
+xrep_xattr_recover_block(
+ struct xrep_xattr *rx,
+ xfs_dablk_t dabno,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t max_len,
+ xfs_extlen_t *actual_len)
+{
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp);
+ if (error)
+ return error;
+ info = bp->b_addr;
+ *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length);
+
+ trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno,
+ be16_to_cpu(info->magic));
+
+ /*
+ * If the buffer has the right magic number for an attr leaf block and
+ * passes a structure check (we don't care about checksums), salvage
+ * as much as we can from the block. */
+ if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) &&
+ xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) &&
+ xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL)
+ error = xrep_xattr_recover_leaf(rx, bp);
+
+ /*
+ * If the buffer didn't already have buffer ops set, it was read in by
+ * the _find_buf function and could very well be /part/ of a multiblock
+ * remote block. Mark it stale so that it doesn't hang around in
+ * memory to cause problems.
+ */
+ if (bp->b_ops == NULL)
+ xfs_buf_stale(bp);
+
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_xattr_insert_rec(
+ struct xrep_xattr *rx,
+ const struct xrep_xattr_key *key)
+{
+ struct xfs_da_args args = {
+ .dp = rx->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rx->sc->ip->i_ino,
+ .geo = rx->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ struct xchk_xattr_buf *ab = rx->sc->buf;
+ int error;
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = ab->name;
+ args.value = ab->value;
+
+ /*
+ * The attribute name is stored near the end of the in-core buffer,
+ * though we reserve one more byte to ensure null termination.
+ */
+ ab->name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rx->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ ab->name[key->namelen] = 0;
+
+ if (key->flags & XFS_ATTR_PARENT) {
+ trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, ab->value,
+ key->valuelen);
+ args.op_flags |= XFS_DA_OP_LOGGED;
+ } else {
+ trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags,
+ ab->name, key->namelen, key->valuelen);
+ }
+
+ /*
+ * xfs_attr_set creates and commits its own transaction. If the attr
+ * already exists, we'll just drop it during the rebuild.
+ */
+ xfs_attr_sethash(&args);
+ error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false);
+ if (error == -EEXIST)
+ error = 0;
+
+ return error;
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_xattr_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during xattr salvaging to avoid livelocking if there
+ * are cycles in the xattr structures. We hold ILOCK_EXCL on both
+ * the inode being repaired, though it is not ijoined to the scrub
+ * transaction.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the two ILOCKs so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from modifying the damaged xattr data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rx->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rx->xattr_records, array_cur) {
+ struct xrep_xattr_key key;
+
+ error = xfarray_load(rx->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_xattr_insert_rec(rx, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->xattr_records);
+ xfblob_truncate(rx->xattr_blobs);
+
+ xrep_tempfile_iounlock(rx->sc);
+
+ /* Recreate the salvage transaction and relock the inode. */
+ error = xchk_trans_alloc(rx->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rx->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_xattr_want_flush_stashed(
+ struct xrep_xattr *rx)
+{
+ unsigned long long bytes;
+
+ if (!rx->can_flush)
+ return false;
+
+ bytes = xfarray_bytes(rx->xattr_records) +
+ xfblob_bytes(rx->xattr_blobs);
+ return bytes > XREP_XATTR_MAX_STASH_BYTES;
+}
+
+/*
+ * Did we observe rename changing parent pointer xattrs while we were flushing
+ * salvaged attrs?
+ */
+static inline bool
+xrep_xattr_saw_pptr_conflict(
+ struct xrep_xattr *rx)
+{
+ bool ret;
+
+ ASSERT(rx->can_flush);
+
+ if (!xfs_has_parent(rx->sc->mp))
+ return false;
+
+ xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL);
+
+ mutex_lock(&rx->lock);
+ ret = xfarray_bytes(rx->pptr_recs) > 0;
+ mutex_unlock(&rx->lock);
+
+ return ret;
+}
+
+/*
+ * Reset the entire repair state back to initial conditions, now that we've
+ * detected a parent pointer update to the attr structure while we were
+ * flushing salvaged attrs. See the locking notes in dir_repair.c for more
+ * information on why this is all necessary.
+ */
+STATIC int
+xrep_xattr_full_reset(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = &sc->tempip->i_af;
+ int error;
+
+ trace_xrep_xattr_full_reset(sc->ip, sc->tempip);
+
+ /* The temporary file's data fork had better not be in btree format. */
+ if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * We begin in transaction context with sc->ip ILOCKed but not joined
+ * to the transaction. To reset to the initial state, we must hold
+ * sc->ip's ILOCK to prevent rename from updating parent pointer
+ * information and the tempfile's ILOCK to clear its contents.
+ */
+ xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock_both(sc);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Free all the blocks of the attr fork of the temp file, and reset
+ * it back to local format.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ ASSERT(ifp->if_bytes == 0);
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK);
+ }
+
+ /* Reinitialize the attr fork to an empty shortform structure. */
+ hdr = ifp->if_data;
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ /*
+ * Roll this transaction to commit our reset ondisk. The tempfile
+ * should no longer be joined to the transaction, so we drop its ILOCK.
+ * This should leave us in transaction context with sc->ip ILOCKed but
+ * not joined to the transaction.
+ */
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * Erase any accumulated parent pointer updates now that we've erased
+ * the tempfile's attr fork. We're resetting the entire repair state
+ * back to where we were initially, except now we won't flush salvaged
+ * xattrs until the very end.
+ */
+ mutex_lock(&rx->lock);
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+
+ rx->can_flush = false;
+ rx->attrs_found = 0;
+
+ ASSERT(xfarray_bytes(rx->xattr_records) == 0);
+ ASSERT(xfblob_bytes(rx->xattr_blobs) == 0);
+ return 0;
+}
+
+/* Extract as many attribute keys and values as we can. */
+STATIC int
+xrep_xattr_recover(
+ struct xrep_xattr *rx)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rx->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ xfs_fileoff_t offset;
+ xfs_extlen_t len;
+ xfs_dablk_t dabno;
+ int nmap;
+ int error;
+
+restart:
+ /*
+ * Iterate each xattr leaf block in the attr fork to scan them for any
+ * attributes that we might salvage.
+ */
+ for (offset = 0;
+ offset < XFS_MAX_FILEOFF;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset,
+ &got, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += len) {
+ xfs_fileoff_t curr_offset = dabno - got.br_startoff;
+ xfs_extlen_t maxlen;
+
+ if (xchk_should_terminate(rx->sc, &error))
+ return error;
+
+ maxlen = min_t(xfs_filblks_t, INT_MAX,
+ got.br_blockcount - curr_offset);
+ error = xrep_xattr_recover_block(rx, dabno,
+ curr_offset + got.br_startblock,
+ maxlen, &len);
+ if (error)
+ return error;
+
+ if (xrep_xattr_want_flush_stashed(rx)) {
+ error = xrep_xattr_flush_stashed(rx);
+ if (error)
+ return error;
+
+ if (xrep_xattr_saw_pptr_conflict(rx)) {
+ error = xrep_xattr_full_reset(rx);
+ if (error)
+ return error;
+
+ goto restart;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reset the extended attribute fork to a state where we can start re-adding
+ * the salvaged attributes.
+ */
+STATIC int
+xrep_xattr_fork_remove(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_sf_hdr *hdr;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
+
+ /*
+ * If the data fork is in btree format, we can't change di_forkoff
+ * because we could run afoul of the rule that the data fork isn't
+ * supposed to be in btree format if there's enough space in the fork
+ * that it could have used extents format. Instead, reinitialize the
+ * attr fork to have a shortform structure with zero attributes.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) {
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
+ hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes,
+ XFS_ATTR_FORK);
+ hdr->count = 0;
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(sc->tp, ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ return 0;
+ }
+
+ /* If we still have attr fork extents, something's wrong. */
+ if (ifp->if_nextents != 0) {
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ unsigned int i = 0;
+
+ xfs_emerg(sc->mp,
+ "inode 0x%llx attr fork still has %llu attr extents, format %d?!",
+ ip->i_ino, ifp->if_nextents, ifp->if_format);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ xfs_err(sc->mp,
+ "[%u]: startoff %llu startblock %llu blockcount %llu state %u",
+ i++, irec.br_startoff,
+ irec.br_startblock, irec.br_blockcount,
+ irec.br_state);
+ }
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_attr_fork_remove(ip, sc->tp);
+ return 0;
+}
+
+/*
+ * Free all the attribute fork blocks of the file being repaired and delete the
+ * fork. The caller must ILOCK the scrub file and join it to the transaction.
+ * This function returns with the inode joined to a clean transaction.
+ */
+int
+xrep_xattr_reset_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->ip);
+
+ /* Unmap all the attr blocks. */
+ if (xfs_ifork_has_extents(&sc->ip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ error = xrep_xattr_fork_remove(sc, sc->ip);
+ if (error)
+ return error;
+
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
+/*
+ * Free all the attribute fork blocks of the temporary file and delete the attr
+ * fork. The caller must ILOCK the tempfile and join it to the transaction.
+ * This function returns with the inode joined to a clean scrub transaction.
+ */
+int
+xrep_xattr_reset_tempfile_fork(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ trace_xrep_xattr_reset_fork(sc->ip, sc->tempip);
+
+ /*
+ * Wipe out the attr fork of the temp file so that regular inode
+ * inactivation won't trip over the corrupt attr fork.
+ */
+ if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ }
+
+ return xrep_xattr_fork_remove(sc, sc->tempip);
+}
+
+/*
+ * Find all the extended attributes for this inode by scraping them out of the
+ * attribute key blocks by hand, and flushing them into the temp file.
+ * When we're done, free the staging memory before exchanging the xattr
+ * structures to reduce memory usage.
+ */
+STATIC int
+xrep_xattr_salvage_attributes(
+ struct xrep_xattr *rx)
+{
+ struct xfs_inode *ip = rx->sc->ip;
+ int error;
+
+ /* Short format xattrs are easy! */
+ if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_xattr_recover_sf(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+ }
+
+ /*
+ * For non-inline xattr structures, the salvage function scans the
+ * buffer cache looking for potential attr leaf blocks. The scan
+ * requires the ability to lock any buffer found and runs independently
+ * of any transaction <-> buffer item <-> buffer linkage. Therefore,
+ * roll the transaction to ensure there are no buffers joined. We hold
+ * the ILOCK independently of the transaction.
+ */
+ error = xfs_trans_roll(&rx->sc->tp);
+ if (error)
+ return error;
+
+ error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ error = xrep_xattr_recover(rx);
+ if (error)
+ return error;
+
+ return xrep_xattr_flush_stashed(rx);
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file. The caller
+ * must hold the tempdir's IOLOCK, must not hold any ILOCKs, and must not be in
+ * transaction context.
+ */
+STATIC int
+xrep_xattr_replay_pptr_update(
+ struct xrep_xattr *rx,
+ const struct xfs_name *xname,
+ struct xrep_xattr_pptr *pptr)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ switch (pptr->action) {
+ case XREP_XATTR_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_xattr_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -EEXIST);
+ return error;
+ case XREP_XATTR_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_xattr_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rx->pptr_args);
+ ASSERT(error != -ENOATTR);
+ return error;
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the xattr rebuild, since
+ * files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_xattr_replay_pptr_updates(
+ struct xrep_xattr *rx)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rx->lock);
+ foreach_xfarray_idx(rx->pptr_recs, array_cur) {
+ struct xrep_xattr_pptr pptr;
+
+ error = xfarray_load(rx->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rx->pptr_names, pptr.name_cookie,
+ &rx->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ mutex_unlock(&rx->lock);
+
+ error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rx->lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rx->pptr_recs);
+ xfblob_truncate(rx->pptr_names);
+ mutex_unlock(&rx->lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rx->lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentadd(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_xattr_stash_parentremove(
+ struct xrep_xattr *rx,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_xattr_pptr pptr = {
+ .action = XREP_XATTR_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rx->pptr_recs, &pptr);
+}
+
+/*
+ * Capture dirent updates being made by other threads. We will have to replay
+ * the parent pointer updates before exchanging attr forks.
+ */
+STATIC int
+xrep_xattr_live_dirent_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_xattr *rx;
+ struct xfs_scrub *sc;
+ int error;
+
+ rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+ sc = rx->sc;
+
+ /*
+ * This thread updated a dirent that points to the file that we're
+ * repairing, so stash the update for replay against the temporary
+ * file.
+ */
+ if (p->ip->i_ino != sc->ip->i_ino)
+ return NOTIFY_DONE;
+
+ mutex_lock(&rx->lock);
+ if (p->delta > 0)
+ error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+ else
+ error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+ if (error)
+ rx->live_update_aborted = true;
+ mutex_unlock(&rx->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange. Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's attributes are in shortform format, convert that
+ * to a single leaf extent so that we can use the atomic mapping
+ * exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_attr_shortform_to_leaf(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform attribute fork, convert
+ * that to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary file's attribute fork with the one being repaired. */
+int
+xrep_xattr_swap(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ bool ip_local, temp_local;
+ int error = 0;
+
+ ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both files have a local format attr fork and the rebuilt
+ * xattr data would fit in the repaired file's attr fork, just copy
+ * the contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local) {
+ int forkoff;
+ int newsize;
+
+ newsize = xfs_attr_sf_totsize(sc->tempip);
+ forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+ if (forkoff > 0) {
+ sc->ip->i_forkoff = forkoff;
+ xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+ return 0;
+ }
+ }
+
+ /* Otherwise, make sure both attr forks are in block-mapping mode. */
+ error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
+ */
+STATIC int
+xrep_xattr_finalize_tempfile(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_xattr_replay_pptr_updates(rx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rx->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/*
+ * Exchange the new extended attribute data (which we created in the tempfile)
+ * with the file being repaired.
+ */
+STATIC int
+xrep_xattr_rebuild_tree(
+ struct xrep_xattr *rx)
+{
+ struct xfs_scrub *sc = rx->sc;
+ int error;
+
+ /*
+ * If we didn't find any attributes to salvage, repair the file by
+ * zapping its attr fork.
+ */
+ if (rx->attrs_found == 0) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+
+ goto forget_acls;
+ }
+
+ trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip);
+
+ /*
+ * Commit the repair transaction and drop the ILOCKs so that we can use
+ * the atomic file content exchange helper functions to compute the
+ * correct resource reservations.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr
+ * modifications, but there's nothing to prevent userspace from reading
+ * the attributes until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run xattr
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rx->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed parent pointer updates to the temp file. After this
+ * point, we're ready to exchange attr fork mappings.
+ */
+ error = xrep_xattr_finalize_tempfile(rx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the blocks mapped by the tempfile's attr fork with the file
+ * being repaired. The old attr blocks will then be attached to the
+ * tempfile, so reap its attr fork.
+ */
+ error = xrep_xattr_swap(sc, &rx->tx);
+ if (error)
+ return error;
+
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+forget_acls:
+ /* Invalidate cached ACLs now that we've reloaded all the xattrs. */
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE);
+ xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT);
+ return 0;
+}
+
+/* Tear down all the incore scan stuff we created. */
+STATIC void
+xrep_xattr_teardown(
+ struct xrep_xattr *rx)
+{
+ if (xfs_has_parent(rx->sc->mp))
+ xfs_dir_hook_del(rx->sc->mp, &rx->dhook);
+ if (rx->pptr_names)
+ xfblob_destroy(rx->pptr_names);
+ if (rx->pptr_recs)
+ xfarray_destroy(rx->pptr_recs);
+ xfblob_destroy(rx->xattr_blobs);
+ xfarray_destroy(rx->xattr_records);
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+}
+
+/* Set up the filesystem scan so we can regenerate extended attributes. */
+STATIC int
+xrep_xattr_setup_scan(
+ struct xfs_scrub *sc,
+ struct xrep_xattr **rxp)
+{
+ struct xrep_xattr *rx;
+ char *descr;
+ int max_len;
+ int error;
+
+ rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS);
+ if (!rx)
+ return -ENOMEM;
+ rx->sc = sc;
+ rx->can_flush = true;
+ rx->xname.name = rx->namebuf;
+
+ mutex_init(&rx->lock);
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values.
+ */
+ max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize);
+ error = xchk_setup_xattr_buf(rx->sc, max_len);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error)
+ goto out_rx;
+
+ /* Set up some staging for salvaged attribute keys and values */
+ descr = xchk_xfile_ino_descr(sc, "xattr keys");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key),
+ &rx->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_rx;
+
+ descr = xchk_xfile_ino_descr(sc, "xattr names");
+ error = xfblob_create(descr, &rx->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_keys;
+
+ if (xfs_has_parent(sc->mp)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer entries");
+ error = xfarray_create(descr, 0,
+ sizeof(struct xrep_xattr_pptr),
+ &rx->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_values;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "xattr retained parent pointer names");
+ error = xfblob_create(descr, &rx->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_pprecs;
+
+ xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update);
+ error = xfs_dir_hook_add(sc->mp, &rx->dhook);
+ if (error)
+ goto out_ppnames;
+ }
+
+ *rxp = rx;
+ return 0;
+out_ppnames:
+ xfblob_destroy(rx->pptr_names);
+out_pprecs:
+ xfarray_destroy(rx->pptr_recs);
+out_values:
+ xfblob_destroy(rx->xattr_blobs);
+out_keys:
+ xfarray_destroy(rx->xattr_records);
+out_rx:
+ mutex_destroy(&rx->lock);
+ kfree(rx);
+ return error;
+}
+
+/*
+ * Repair the extended attribute metadata.
+ *
+ * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer.
+ * The buffer cache in XFS can't handle aliased multiblock buffers, so this
+ * might misbehave if the attr fork is crosslinked with other filesystem
+ * metadata.
+ */
+int
+xrep_xattr(
+ struct xfs_scrub *sc)
+{
+ struct xrep_xattr *rx = NULL;
+ int error;
+
+ if (!xfs_inode_hasattr(sc->ip))
+ return -ENOENT;
+
+ /* The rmapbt is required to reap the old attr fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_xattr_setup_scan(sc, &rx);
+ if (error)
+ return error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_xattr_salvage_attributes(rx);
+ if (error)
+ goto out_scan;
+
+ if (rx->live_update_aborted) {
+ error = -EIO;
+ goto out_scan;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_scan;
+
+ error = xrep_xattr_rebuild_tree(rx);
+ if (error)
+ goto out_scan;
+
+out_scan:
+ xrep_xattr_teardown(rx);
+ return error;
+}
diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h
new file mode 100644
index 000000000000..979729bd4a5f
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ATTR_REPAIR_H__
+#define __XFS_SCRUB_ATTR_REPAIR_H__
+
+struct xrep_tempexch;
+
+int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx);
+int xrep_xattr_reset_fork(struct xfs_scrub *sc);
+int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 1449bb5262d9..7ba35a7a7920 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -40,22 +40,23 @@ struct xbitmap64_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap64_node *
+static inline __maybe_unused struct xbitmap64_node *
xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap64_extent(bn, bitmap) \
@@ -314,22 +315,23 @@ struct xbitmap32_node {
* These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
* forward-declare them anyway for clarity.
*/
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline void
+static inline __maybe_unused void
xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
uint32_t last);
-static inline struct xbitmap32_node *
+static inline __maybe_unused struct xbitmap32_node *
xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
uint32_t last);
INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
+ __bn_subtree_last, START, LAST, static inline __maybe_unused,
+ xbitmap32_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
#define for_each_xbitmap32_extent(bn, bitmap) \
@@ -566,3 +568,17 @@ xbitmap32_test(
*len = bn->bn_start - start;
return false;
}
+
+/* Count the number of set regions in this bitmap. */
+uint32_t
+xbitmap32_count_set_regions(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+ uint32_t nr = 0;
+
+ for_each_xbitmap32_extent(bn, bitmap)
+ nr++;
+
+ return nr;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 2df8911606d6..710c1ac5e323 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
bool xbitmap32_empty(struct xbitmap32 *bitmap);
bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
+uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap);
+
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b169cddde6da..4f1e2574660d 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -19,7 +19,10 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
#include "xfs_health.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtrmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -142,15 +145,22 @@ static inline bool
xchk_bmap_get_rmap(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno,
+ xfs_agblock_t bno,
uint64_t owner,
struct xfs_rmap_irec *rmap)
{
+ struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur;
xfs_fileoff_t offset;
unsigned int rflags = 0;
int has_rmap;
int error;
+ if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork))
+ curp = &info->sc->sr.rmap_cur;
+
+ if (*curp == NULL)
+ return false;
+
if (info->whichfork == XFS_ATTR_FORK)
rflags |= XFS_RMAP_ATTR_FORK;
if (irec->br_state == XFS_EXT_UNWRITTEN)
@@ -171,13 +181,13 @@ xchk_bmap_get_rmap(
* range rmap lookup to make sure we get the correct owner/offset.
*/
if (info->is_shared) {
- error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
- owner, offset, rflags, rmap, &has_rmap);
+ error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset,
+ rflags, rmap, &has_rmap);
} else {
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
- owner, offset, rflags, rmap, &has_rmap);
+ error = xfs_rmap_lookup_le(*curp, bno, owner, offset,
+ rflags, rmap, &has_rmap);
}
- if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, curp))
return false;
if (!has_rmap)
@@ -191,29 +201,29 @@ STATIC void
xchk_bmap_xref_rmap(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno)
+ xfs_agblock_t bno)
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
uint64_t owner = info->sc->ip->i_ino;
- if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+ if (xchk_skip_xref(info->sc->sm))
return;
/* Find the rmap record for this irec. */
- if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
return;
/*
* The rmap must be an exact match for this incore file mapping record,
* which may have arisen from multiple ondisk records.
*/
- if (rmap.rm_startblock != agbno)
+ if (rmap.rm_startblock != bno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (rmap_end != agbno + irec->br_blockcount)
+ if (rmap_end != bno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -258,7 +268,7 @@ STATIC void
xchk_bmap_xref_rmap_cow(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec,
- xfs_agblock_t agbno)
+ xfs_agblock_t bno)
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
@@ -268,7 +278,7 @@ xchk_bmap_xref_rmap_cow(
return;
/* Find the rmap record for this irec. */
- if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap))
return;
/*
@@ -276,12 +286,12 @@ xchk_bmap_xref_rmap_cow(
* can start before and end after the physical space allocated to this
* mapping. There are no offsets to check.
*/
- if (rmap.rm_startblock > agbno)
+ if (rmap.rm_startblock > bno)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (rmap_end < agbno + irec->br_blockcount)
+ if (rmap_end < bno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -314,8 +324,58 @@ xchk_bmap_rt_iextent_xref(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
+ struct xfs_owner_info oinfo;
+ xfs_rgblock_t rgbno;
+ int error;
+
+ error = xchk_rtgroup_init_existing(info->sc,
+ xfs_rtb_to_rgno(ip->i_mount, irec->br_startblock),
+ &info->sc->sr);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ return;
+
+ error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ goto out_free;
+
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
+
+ if (!xfs_has_rtrmapbt(info->sc->mp))
+ goto out_cur;
+
+ rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock);
+
+ switch (info->whichfork) {
+ case XFS_DATA_FORK:
+ xchk_bmap_xref_rmap(info, irec, rgbno);
+ if (!xfs_is_reflink_inode(info->sc->ip)) {
+ xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+ info->whichfork, irec->br_startoff);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &oinfo);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ }
+ xchk_xref_is_not_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ case XFS_COW_FORK:
+ xchk_bmap_xref_rmap_cow(info, irec, rgbno);
+ xchk_xref_is_only_rt_owned_by(info->sc, rgbno,
+ irec->br_blockcount, &XFS_RMAP_OINFO_COW);
+ xchk_xref_is_rt_cow_staging(info->sc, rgbno,
+ irec->br_blockcount);
+ xchk_xref_is_not_rt_shared(info->sc, rgbno,
+ irec->br_blockcount);
+ break;
+ }
+out_cur:
+ xchk_rtgroup_btcur_free(&info->sc->sr);
+out_free:
+ xchk_rtgroup_free(info->sc, &info->sc->sr);
}
/* Cross-reference a single datadev extent record. */
@@ -600,9 +660,8 @@ xchk_bmap_check_rmap(
if (irec.br_startoff != check_rec.rm_offset)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
- if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_ag.pag->pag_agno,
- check_rec.rm_startblock))
+ if (irec.br_startblock !=
+ xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
if (irec.br_blockcount > check_rec.rm_blockcount)
@@ -656,6 +715,30 @@ xchk_bmap_check_ag_rmaps(
return error;
}
+/* Make sure each rt rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rt_rmaps(
+ struct xfs_scrub *sc,
+ struct xfs_rtgroup *rtg)
+{
+ struct xchk_bmap_check_rmap_info sbcri;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
+
+ sbcri.sc = sc;
+ sbcri.whichfork = XFS_DATA_FORK;
+ error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
+ if (error == -ECANCELED)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ return error;
+}
+
/*
* Decide if we want to scan the reverse mappings to determine if the attr
* fork /really/ has zero space mappings.
@@ -710,10 +793,6 @@ xchk_bmap_check_empty_datafork(
{
struct xfs_ifork *ifp = &ip->i_df;
- /* Don't support realtime rmap checks yet. */
- if (XFS_IS_REALTIME_INODE(ip))
- return false;
-
/*
* If the dinode repair found a bad data fork, it will reset the fork
* to extents format with zero records and wait for the this scrubber
@@ -761,11 +840,25 @@ xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error;
- for_each_perag(sc->mp, agno, pag) {
+ if (xfs_ifork_is_realtime(sc->ip, whichfork)) {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ error = xchk_bmap_check_rt_rmaps(sc, rtg);
+ if (error ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ return 0;
+ }
+
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
@@ -822,9 +915,12 @@ xchk_bmap_iext_mapping(
/* Are these two mappings contiguous with each other? */
static inline bool
xchk_are_bmaps_contiguous(
+ const struct xchk_bmap_info *info,
const struct xfs_bmbt_irec *b1,
const struct xfs_bmbt_irec *b2)
{
+ struct xfs_mount *mp = info->sc->mp;
+
/* Don't try to combine unallocated mappings. */
if (!xfs_bmap_is_real_extent(b1))
return false;
@@ -838,6 +934,17 @@ xchk_are_bmaps_contiguous(
return false;
if (b1->br_state != b2->br_state)
return false;
+
+ /*
+ * Don't combine bmaps that would cross rtgroup boundaries. This is a
+ * valid state, but if combined they will fail rtb extent checks.
+ */
+ if (info->is_rt && xfs_has_rtgroups(mp)) {
+ if (xfs_rtb_to_rgno(mp, b1->br_startblock) !=
+ xfs_rtb_to_rgno(mp, b2->br_startblock))
+ return false;
+ }
+
return true;
}
@@ -875,7 +982,7 @@ xchk_bmap_iext_iter(
* that we just read, if possible.
*/
while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
- if (!xchk_are_bmaps_contiguous(irec, &got))
+ if (!xchk_are_bmaps_contiguous(info, irec, &got))
break;
if (!xchk_bmap_iext_mapping(info, &got)) {
@@ -924,21 +1031,27 @@ xchk_bmap(
if (!ifp)
return -ENOENT;
- info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.is_rt = xfs_ifork_is_realtime(ip, whichfork);
info.whichfork = whichfork;
info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
info.sc = sc;
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink filesystems. */
- if (!xfs_has_reflink(mp)) {
+ /* No CoW forks filesystem doesn't support out of place writes */
+ if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
break;
case XFS_ATTR_FORK:
- if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
+ /*
+ * "attr" means that an attr fork was created at some point in
+ * the life of this filesystem. "attr2" means that inodes have
+ * variable-sized data/attr fork areas. Hence we only check
+ * attr here.
+ */
+ if (!xfs_has_attr(mp))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
@@ -951,6 +1064,7 @@ xchk_bmap(
case XFS_DINODE_FMT_UUID:
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_META_BTREE:
/* No mappings to check. */
if (whichfork == XFS_COW_FORK)
xchk_fblock_set_corrupt(sc, whichfork, 0);
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index a4bb89fdd510..1084213b8e9b 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -25,11 +25,13 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_quota.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_reflink.h"
+#include "xfs_rtgroup.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -99,14 +101,21 @@ xrep_bmap_discover_shared(
xfs_filblks_t blockcount)
{
struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *cur;
xfs_agblock_t agbno;
xfs_agblock_t fbno;
xfs_extlen_t flen;
int error;
- agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
- error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
- &fbno, &flen, false);
+ if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ agbno = xfs_rtb_to_rgbno(sc->mp, startblock);
+ cur = sc->sr.refc_cur;
+ } else {
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ cur = sc->sa.refc_cur;
+ }
+ error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen,
+ false);
if (error)
return error;
@@ -196,7 +205,7 @@ xrep_bmap_check_fork_rmap(
return -EFSCORRUPTED;
/* Check that this is within the AG. */
- if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+ if (!xfs_verify_agbext(to_perag(cur->bc_group), rec->rm_startblock,
rec->rm_blockcount))
return -EFSCORRUPTED;
@@ -237,7 +246,6 @@ xrep_bmap_walk_rmap(
void *priv)
{
struct xrep_bmap *rb = priv;
- struct xfs_mount *mp = cur->bc_mp;
xfs_fsblock_t fsbno;
int error = 0;
@@ -269,8 +277,7 @@ xrep_bmap_walk_rmap(
if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten)
return -EFSCORRUPTED;
- fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
- rec->rm_startblock);
+ fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), rec->rm_startblock);
if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
rb->old_bmbt_block_count += rec->rm_blockcount;
@@ -361,6 +368,114 @@ xrep_bmap_scan_ag(
return error;
}
+#ifdef CONFIG_XFS_RT
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_rtfork_rmap(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec)
+{
+ /* xattr extents are never stored on realtime devices */
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ return -EFSCORRUPTED;
+
+ /* bmbt blocks are never stored on realtime devices */
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return -EFSCORRUPTED;
+
+ /* Data extents for non-rt files are never stored on the rt device. */
+ if (!XFS_IS_REALTIME_INODE(sc->ip))
+ return -EFSCORRUPTED;
+
+ /* Check the file offsets and physical extents. */
+ if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Check that this is within the rtgroup. */
+ if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ return xrep_require_rtext_inuse(sc, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Record realtime extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rb->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec);
+ if (error)
+ return error;
+
+ /*
+ * Record all blocks allocated to this file even if the extent isn't
+ * for the fork we're rebuilding so that we can reset di_nblocks later.
+ */
+ rb->nblocks += rec->rm_blockcount;
+
+ /* If this rmap isn't for the fork we want, we're done. */
+ if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+
+ return xrep_bmap_from_rmap(rb, rec->rm_offset,
+ xfs_rgbno_to_rtb(to_rtg(cur->bc_group),
+ rec->rm_startblock),
+ rec->rm_blockcount,
+ rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/* Scan the realtime reverse mappings to build the new extent map. */
+STATIC int
+xrep_bmap_scan_rtgroup(
+ struct xrep_bmap *rb,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ if (!xfs_has_rtrmapbt(sc->mp))
+ return 0;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+ XFS_RTGLOCK_RMAP |
+ XFS_RTGLOCK_REFCOUNT |
+ XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+ return error;
+}
+#else
+static inline int
+xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg)
+{
+ return -EFSCORRUPTED;
+}
+#endif
+
/* Find the delalloc extents from the old incore extent tree. */
STATIC int
xrep_bmap_find_delalloc(
@@ -409,12 +524,27 @@ xrep_bmap_find_mappings(
struct xrep_bmap *rb)
{
struct xfs_scrub *sc = rb->sc;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error = 0;
+ /*
+ * Iterate the rtrmaps for extents. Metadata files never have content
+ * on the realtime device, so there's no need to scan them.
+ */
+ if (!xfs_is_metadir_inode(sc->ip)) {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ error = xrep_bmap_scan_rtgroup(rb, rtg);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+ }
+
/* Iterate the rmaps for extents. */
- for_each_perag(sc->mp, agno, pag) {
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xrep_bmap_scan_ag(rb, pag);
if (error) {
xfs_perag_rele(pag);
@@ -480,7 +610,7 @@ xrep_bmap_iroot_size(
{
ASSERT(level > 0);
- return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+ return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
}
/* Update the inode counters. */
@@ -639,7 +769,13 @@ xrep_bmap_build_new_fork(
rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
- bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+ /*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping
+ * data structure.
+ */
+ bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK);
+ xfs_btree_stage_ifakeroot(bmap_cur, ifake);
/*
* Figure out the size and format of the new fork, then fill it with
@@ -728,6 +864,7 @@ xrep_bmap_check_inputs(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_META_BTREE:
return -ECANCELED;
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -750,10 +887,6 @@ xrep_bmap_check_inputs(
return -EINVAL;
}
- /* Don't know how to rebuild realtime data forks. */
- if (XFS_IS_REALTIME_INODE(sc->ip))
- return -EOPNOTSUPP;
-
return 0;
}
@@ -779,10 +912,6 @@ xrep_bmap_init_reflink_scan(
if (whichfork != XFS_DATA_FORK)
return RLS_IRRELEVANT;
- /* cannot share realtime extents */
- if (XFS_IS_REALTIME_INODE(sc->ip))
- return RLS_IRRELEVANT;
-
return RLS_UNKNOWN;
}
@@ -795,7 +924,7 @@ xrep_bmap(
{
struct xrep_bmap *rb;
char *descr;
- unsigned int max_bmbt_recs;
+ xfs_extnum_t max_bmbt_recs;
bool large_extcount;
int error = 0;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 1935b9ce1885..fe678a0438bc 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -47,7 +47,7 @@ __xchk_btree_process_error(
*error = 0;
fallthrough;
default:
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
*error, ret_ip);
else
@@ -91,7 +91,7 @@ __xchk_btree_set_corrupt(
{
sc->sm->sm_flags |= errflag;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_error(sc, cur, level,
ret_ip);
else
@@ -168,7 +168,7 @@ xchk_btree_rec(
if (xfs_btree_keycmp_lt(cur, &key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is high_key(rec) no larger than the parent high key? */
@@ -215,7 +215,7 @@ xchk_btree_key(
if (xfs_btree_keycmp_lt(cur, key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, level);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is this block's high key no larger than the parent high key? */
@@ -236,22 +236,18 @@ xchk_btree_ptr_ok(
int level,
union xfs_btree_ptr *ptr)
{
- bool res;
-
/* A btree rooted in an inode has no block pointer to the root. */
- if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == bs->cur->bc_nlevels)
return true;
/* Otherwise, check the pointers. */
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
- else
- res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
- if (!res)
+ if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return false;
+ }
- return res;
+ return true;
}
/* Check that a btree block's sibling matches what we expect it. */
@@ -374,18 +370,21 @@ xchk_btree_check_block_owner(
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- xfs_btnum_t btnum;
bool init_sa;
int error = 0;
if (!bs->cur)
return 0;
- btnum = bs->cur->bc_btnum;
agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
- init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ /*
+ * If the btree being examined is not itself a per-AG btree, initialize
+ * sc->sa so that we can check for the presence of an ownership record
+ * in the rmap btree for the AG containing the block.
+ */
+ init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
if (init_sa) {
error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
@@ -399,11 +398,11 @@ xchk_btree_check_block_owner(
* have to nullify it (to shut down further block owner checks) if
* self-xref encounters problems.
*/
- if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
bs->cur = NULL;
xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
- if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
bs->cur = NULL;
out_free:
@@ -429,7 +428,7 @@ xchk_btree_check_owner(
* up.
*/
if (bp == NULL) {
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -442,7 +441,7 @@ xchk_btree_check_owner(
* duplicate cursors. Therefore, save the buffer daddr for
* later scanning.
*/
- if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
struct check_owner *co;
co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
@@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs(
* existing filesystems, so instead we disable the check for data fork
* bmap btrees when there's an attr fork.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
xfs_inode_has_attr_fork(bs->sc->ip))
return false;
@@ -508,7 +507,7 @@ xchk_btree_check_minrecs(
* child block might be less than the standard minrecs, but that's ok
* provided that there's only one direct child of the root.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == cur->bc_nlevels - 2) {
struct xfs_btree_block *root_block;
struct xfs_buf *root_bp;
@@ -562,7 +561,7 @@ xchk_btree_block_check_keys(
return;
}
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Make sure the high key of this block matches the parent. */
@@ -585,7 +584,6 @@ xchk_btree_get_block(
struct xfs_btree_block **pblock,
struct xfs_buf **pbp)
{
- xfs_failaddr_t failed_at;
int error;
*pblock = NULL;
@@ -597,13 +595,7 @@ xchk_btree_get_block(
return error;
xfs_btree_get_block(bs->cur, level, pbp);
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
- level, *pbp);
- else
- failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
- level, *pbp);
- if (failed_at) {
+ if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -664,7 +656,7 @@ xchk_btree_block_keys(
if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Get high keys */
@@ -728,7 +720,7 @@ xchk_btree(
* error codes for us.
*/
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 81f2b96bb5a7..28ad341df8ee 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -26,14 +26,24 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
+#include "scrub/tempfile.h"
/* Common code for the metadata scrubbers. */
@@ -82,6 +92,15 @@ __xchk_process_error(
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -89,8 +108,7 @@ __xchk_process_error(
*error = 0;
fallthrough;
default:
- trace_xchk_op_error(sc, agno, bno, *error,
- ret_ip);
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
break;
}
return false;
@@ -108,6 +126,17 @@ xchk_process_error(
}
bool
+xchk_process_rt_error(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ xfs_rgblock_t rgbno,
+ int *error)
+{
+ return __xchk_process_error(sc, rgno, rgbno, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
xchk_xref_process_error(
struct xfs_scrub *sc,
xfs_agnumber_t agno,
@@ -136,6 +165,16 @@ __xchk_fblock_process_error(
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -227,6 +266,19 @@ xchk_block_set_corrupt(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
+#ifdef CONFIG_XFS_QUOTA
+/* Record a corrupt quota counter. */
+void
+xchk_qcheck_set_corrupt(
+ struct xfs_scrub *sc,
+ unsigned int dqtype,
+ xfs_dqid_t id)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
+}
+#endif
+
/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
@@ -412,7 +464,7 @@ xchk_perag_read_headers(
{
int error;
- error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -427,7 +479,7 @@ xchk_perag_read_headers(
* Grab the AG headers for the attached perag structure and wait for pending
* intents to drain.
*/
-static int
+int
xchk_perag_drain_and_lock(
struct xfs_scrub *sc)
{
@@ -477,7 +529,7 @@ xchk_perag_drain_and_lock(
* Obviously, this should be slanted against scrub and in favor
* of runtime threads.
*/
- if (!xfs_perag_intent_busy(sa->pag))
+ if (!xfs_group_intent_busy(pag_group(sa->pag)))
return 0;
if (sa->agf_bp) {
@@ -492,7 +544,7 @@ xchk_perag_drain_and_lock(
if (!(sc->flags & XCHK_FSGATES_DRAIN))
return -ECHRNG;
- error = xfs_perag_intent_drain(sa->pag);
+ error = xfs_group_intent_drain(pag_group(sa->pag));
if (error == -ERESTARTSYS)
error = -EINTR;
} while (!error);
@@ -555,46 +607,50 @@ xchk_ag_btcur_init(
{
struct xfs_mount *mp = sc->mp;
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+ if (sa->agf_bp) {
/* Set up a bnobt cursor for cross-referencing. */
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_BNO);
- }
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
+ XFS_SCRUB_TYPE_BNOBT);
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_CNT);
- }
-
- /* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
- sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_INO);
- }
-
- /* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_has_finobt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
- sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_FINO);
- }
-
- /* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_rmapbt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
- sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
+ XFS_SCRUB_TYPE_CNTBT);
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (xfs_has_rmapbt(mp)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
+ XFS_SCRUB_TYPE_RMAPBT);
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (xfs_has_reflink(mp)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
+ XFS_SCRUB_TYPE_REFCNTBT);
+ }
}
- /* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_reflink(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
- sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, sa->pag);
+ if (sa->agi_bp) {
+ /* Set up a inobt cursor for cross-referencing. */
+ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
+ XFS_SCRUB_TYPE_INOBT);
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (xfs_has_finobt(mp)) {
+ sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
+ XFS_SCRUB_TYPE_FINOBT);
+ }
}
}
@@ -643,6 +699,163 @@ xchk_ag_init(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/*
+ * For scrubbing a realtime group, grab all the in-core resources we'll need to
+ * check the metadata, which means taking the ILOCK of the realtime group's
+ * metadata inodes. Callers must not join these inodes to the transaction with
+ * non-zero lockflags or concurrency problems will result. The @rtglock_flags
+ * argument takes XFS_RTGLOCK_* flags.
+ */
+int
+xchk_rtgroup_init(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg == NULL);
+ ASSERT(sr->rtlock_flags == 0);
+
+ sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
+ if (!sr->rtg)
+ return -ENOENT;
+ return 0;
+}
+
+/* Lock all the rt group metadata inode ILOCKs and wait for intents. */
+int
+xchk_rtgroup_lock(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr,
+ unsigned int rtglock_flags)
+{
+ int error = 0;
+
+ ASSERT(sr->rtg != NULL);
+
+ /*
+ * If we're /only/ locking the rtbitmap in shared mode, then we're
+ * obviously not trying to compare records in two metadata inodes.
+ * There's no need to drain intents here because the caller (most
+ * likely the rgsuper scanner) doesn't need that level of consistency.
+ */
+ if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+
+ /*
+ * If we've grabbed a non-metadata file for scrubbing, we
+ * assume that holding its ILOCK will suffice to coordinate
+ * with any rt intent chains involving this inode.
+ */
+ if (sc->ip && !xfs_is_internal_inode(sc->ip))
+ break;
+
+ /*
+ * Decide if the rt group is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the metadata inodes for this rt group;
+ * now take a look to see if there are any intents in progress.
+ * If there are, drop the rt group inode locks and wait for the
+ * intents to drain. Since we hold the rt group inode locks
+ * for the duration of the scrub, this is the only time we have
+ * to sample the intents counter; any threads increasing it
+ * after this point can't possibly be in the middle of a chain
+ * of rt metadata updates.
+ *
+ * Obviously, this should be slanted against scrub and in favor
+ * of runtime threads.
+ */
+ if (!xfs_group_intent_busy(rtg_group(sr->rtg)))
+ break;
+
+ xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
+ if (!(sc->flags & XCHK_FSGATES_DRAIN))
+ return -ECHRNG;
+ error = xfs_group_intent_drain(rtg_group(sr->rtg));
+ if (error) {
+ if (error == -ERESTARTSYS)
+ error = -EINTR;
+ return error;
+ }
+ } while (1);
+
+ sr->rtlock_flags = rtglock_flags;
+
+ if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP))
+ sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+
+ if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
+
+ return 0;
+}
+
+/*
+ * Free all the btree cursors and other incore data relating to the realtime
+ * group. This has to be done /before/ committing (or cancelling) the scrub
+ * transaction.
+ */
+void
+xchk_rtgroup_btcur_free(
+ struct xchk_rt *sr)
+{
+ if (sr->rmap_cur)
+ xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR);
+ if (sr->refc_cur)
+ xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR);
+
+ sr->refc_cur = NULL;
+ sr->rmap_cur = NULL;
+}
+
+/*
+ * Unlock the realtime group. This must be done /after/ committing (or
+ * cancelling) the scrub transaction.
+ */
+void
+xchk_rtgroup_unlock(
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg != NULL);
+
+ if (sr->rtlock_flags) {
+ xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
+ sr->rtlock_flags = 0;
+ }
+}
+
+/*
+ * Unlock the realtime group and release its resources. This must be done
+ * /after/ committing (or cancelling) the scrub transaction.
+ */
+void
+xchk_rtgroup_free(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg != NULL);
+
+ xchk_rtgroup_unlock(sr);
+
+ xfs_rtgroup_put(sr->rtg);
+ sr->rtg = NULL;
+}
+#endif /* CONFIG_XFS_RT */
+
/* Per-scrubber setup functions */
void
@@ -653,6 +866,13 @@ xchk_trans_cancel(
sc->tp = NULL;
}
+int
+xchk_trans_alloc_empty(
+ struct xfs_scrub *sc)
+{
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -672,7 +892,7 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/* Set us up with a transaction and an empty context. */
@@ -686,6 +906,14 @@ xchk_setup_fs(
return xchk_trans_alloc(sc, resblks);
}
+/* Set us up with a transaction and an empty context to repair rt metadata. */
+int
+xchk_setup_rt(
+ struct xfs_scrub *sc)
+{
+ return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc));
+}
+
/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
@@ -737,7 +965,7 @@ xchk_iget(
{
ASSERT(sc->tp != NULL);
- return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+ return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
}
/*
@@ -783,13 +1011,13 @@ again:
* in the iget cache miss path.
*/
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
xfs_perag_put(pag);
if (error)
return error;
- error = xfs_iget(mp, tp, inum,
- XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+ error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
+ ipp);
if (error == -EAGAIN) {
/*
* The inode may be in core but temporarily unavailable and may
@@ -900,9 +1128,15 @@ xchk_iget_for_scrubbing(
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
return xchk_install_live_inode(sc, ip_in);
- /* Reject internal metadata files and obviously bad inode numbers. */
- if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ /*
+ * On pre-metadir filesystems, reject internal metadata files. For
+ * metadir filesystems, limited scrubbing of any file in the metadata
+ * directory tree by handle is allowed, because that is the only way to
+ * validate the lack of parent pointers in the sb-root metadata inodes.
+ */
+ if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
return -ENOENT;
+ /* Reject obviously bad inode numbers. */
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
@@ -1000,9 +1234,7 @@ xchk_irele(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- if (current->journal_info != NULL) {
- ASSERT(current->journal_info == sc->tp);
-
+ if (sc->tp) {
/*
* If we are in a transaction, we /cannot/ drop the inode
* ourselves, because the VFS will trigger writeback, which
@@ -1018,12 +1250,6 @@ xchk_irele(
spin_lock(&VFS_I(ip)->i_lock);
VFS_I(ip)->i_state &= ~I_DONTCACHE;
spin_unlock(&VFS_I(ip)->i_lock);
- } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
- /*
- * If this is the last reference to the inode and the caller
- * permits it, set DONTCACHE to avoid thrashing.
- */
- d_mark_dontcache(VFS_I(ip));
}
xfs_irele(ip);
@@ -1045,6 +1271,10 @@ xchk_setup_inode_contents(
if (error)
return error;
+ error = xrep_tempfile_adjust_directory_tree(sc);
+ if (error)
+ return error;
+
/* Lock the inode so the VFS cannot touch this file. */
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1160,27 +1390,12 @@ xchk_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- break;
- }
-
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
+ xchk_scrub_free_subord(sub);
return error;
}
@@ -1215,12 +1430,6 @@ xchk_metadata_inode_forks(
return 0;
}
- /* They also should never have extended attributes. */
- if (xfs_inode_hasattr(sc->ip)) {
- xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- return 0;
- }
-
/* Invoke the data fork scrubber. */
error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
@@ -1237,6 +1446,21 @@ xchk_metadata_inode_forks(
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
+ /*
+ * Metadata files can only have extended attributes on metadir
+ * filesystems, either for parent pointers or for actual xattr data.
+ */
+ if (xfs_inode_hasattr(sc->ip)) {
+ if (!xfs_has_metadir(sc->mp)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+ }
+
return 0;
}
@@ -1257,7 +1481,16 @@ xchk_fsgates_enable(
trace_xchk_fsgates_enable(sc, scrub_fsgates);
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_enable();
+ xfs_defer_drain_wait_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_enable();
sc->flags |= scrub_fsgates;
}
@@ -1303,7 +1536,7 @@ xchk_inode_is_allocated(
}
/* reject inode numbers outside existing AGs */
- ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ ino = xfs_agino_to_ino(pag, agino);
if (!xfs_verify_ino(mp, ino))
return -EINVAL;
@@ -1413,3 +1646,92 @@ out_rcu:
rcu_read_unlock();
return error;
}
+
+/* Is this inode a root directory for either tree? */
+bool
+xchk_inode_is_dirtree_root(const struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ return ip == mp->m_rootip ||
+ (xfs_has_metadir(mp) && ip == mp->m_metadirip);
+}
+
+/* Does the superblock point down to this inode? */
+bool
+xchk_inode_is_sb_rooted(const struct xfs_inode *ip)
+{
+ return xchk_inode_is_dirtree_root(ip) ||
+ xfs_is_sb_inum(ip->i_mount, ip->i_ino);
+}
+
+/* What is the root directory inumber for this inode? */
+xfs_ino_t
+xchk_inode_rootdir_inum(const struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_metadir_inode(ip))
+ return mp->m_metadirip->i_ino;
+ return mp->m_rootip->i_ino;
+}
+
+static int
+xchk_meta_btree_count_blocks(
+ struct xfs_scrub *sc,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!sc->sr.rtg) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ switch (sc->ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg);
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_btree_count_blocks(cur, count);
+ xfs_btree_del_cursor(cur, error);
+ if (!error) {
+ *nextents = 0;
+ (*count)--; /* don't count the btree iroot */
+ }
+ return error;
+}
+
+/* Count the blocks used by a file, even if it's a metadata inode. */
+int
+xchk_inode_count_blocks(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_extnum_t *nextents,
+ xfs_filblks_t *count)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ if (!ifp) {
+ *nextents = 0;
+ *count = 0;
+ return 0;
+ }
+
+ if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ return xchk_meta_btree_count_blocks(sc, nextents, count);
+ }
+
+ return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents,
+ count);
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index da09580b454a..19877d99f255 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -6,36 +6,14 @@
#ifndef __XFS_SCRUB_COMMON_H__
#define __XFS_SCRUB_COMMON_H__
-/*
- * We /could/ terminate a scrub/repair operation early. If we're not
- * in a good place to continue (fatal signal, etc.) then bail out.
- * Note that we're careful not to make any judgements about *error.
- */
-static inline bool
-xchk_should_terminate(
- struct xfs_scrub *sc,
- int *error)
-{
- /*
- * If preemption is disabled, we need to yield to the scheduler every
- * few seconds so that we don't run afoul of the soft lockup watchdog
- * or RCU stall detector.
- */
- cond_resched();
-
- if (fatal_signal_pending(current)) {
- if (*error == 0)
- *error = -EINTR;
- return true;
- }
- return false;
-}
-
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
+bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+ xfs_rgblock_t rgbno, int *error);
bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset, int *error);
@@ -54,6 +32,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc,
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset);
+#ifdef CONFIG_XFS_QUOTA
+void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype,
+ xfs_dqid_t id);
+#endif
void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
@@ -73,9 +55,15 @@ int xchk_checkpoint_log(struct xfs_mount *mp);
bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
struct xfs_btree_cur **curpp);
+static inline int xchk_setup_nothing(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+
/* Setup functions */
int xchk_setup_agheader(struct xfs_scrub *sc);
int xchk_setup_fs(struct xfs_scrub *sc);
+int xchk_setup_rt(struct xfs_scrub *sc);
int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
int xchk_setup_ag_rmapbt(struct xfs_scrub *sc);
@@ -87,41 +75,41 @@ int xchk_setup_directory(struct xfs_scrub *sc);
int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
+int xchk_setup_dirtree(struct xfs_scrub *sc);
+int xchk_setup_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
+int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
+int xchk_setup_rtrmapbt(struct xfs_scrub *sc);
+int xchk_setup_rtrefcountbt(struct xfs_scrub *sc);
#else
-static inline int
-xchk_setup_rtbitmap(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_setup_rtsummary(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_setup_rtbitmap xchk_setup_nothing
+# define xchk_setup_rtsummary xchk_setup_nothing
+# define xchk_setup_rgsuperblock xchk_setup_nothing
+# define xchk_setup_rtrmapbt xchk_setup_nothing
+# define xchk_setup_rtrefcountbt xchk_setup_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
+int xchk_setup_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_ino_dqattach(struct xfs_scrub *sc)
{
return 0;
}
-static inline int
-xchk_setup_quota(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_setup_quota xchk_setup_nothing
+# define xchk_setup_quotacheck xchk_setup_nothing
#endif
int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
+int xchk_perag_drain_and_lock(struct xfs_scrub *sc);
/*
* Grab all AG resources, treating the inability to grab the perag structure as
@@ -139,6 +127,41 @@ xchk_ag_init_existing(
return error == -ENOENT ? -EFSCORRUPTED : error;
}
+#ifdef CONFIG_XFS_RT
+
+/* All the locks we need to check an rtgroup. */
+#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
+
+int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+ struct xchk_rt *sr);
+
+static inline int
+xchk_rtgroup_init_existing(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ struct xchk_rt *sr)
+{
+ int error = xchk_rtgroup_init(sc, rgno, sr);
+
+ return error == -ENOENT ? -EFSCORRUPTED : error;
+}
+
+int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr,
+ unsigned int rtglock_flags);
+void xchk_rtgroup_unlock(struct xchk_rt *sr);
+void xchk_rtgroup_btcur_free(struct xchk_rt *sr);
+void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
+#else
+# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED)
+# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED)
+# define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED)
+# define xchk_rtgroup_unlock(sr) do { } while (0)
+# define xchk_rtgroup_btcur_free(sr) do { } while (0)
+# define xchk_rtgroup_free(sc, sr) do { } while (0)
+#endif /* CONFIG_XFS_RT */
+
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
void xchk_ag_btcur_free(struct xchk_ag *sa);
@@ -199,8 +222,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
}
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
{
@@ -220,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc)
return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED);
}
-#else
-# define xchk_needs_repair(sc) (false)
-# define xchk_could_repair(sc) (false)
-#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
@@ -237,13 +256,20 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
#define xchk_xfile_ag_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
(sc)->mp->m_super->s_id, \
- (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
+ (sc)->sa.pag ? \
+ pag_agno((sc)->sa.pag) : (sc)->sm->sm_agno, \
##__VA_ARGS__)
#define xchk_xfile_ino_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \
(sc)->mp->m_super->s_id, \
(sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
##__VA_ARGS__)
+#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->sa.pag ? \
+ rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \
+ ##__VA_ARGS__)
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
@@ -261,5 +287,11 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
bool *inuse);
+int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork,
+ xfs_extnum_t *nextents, xfs_filblks_t *count);
+
+bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip);
+bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip);
+xfs_ino_t xchk_inode_rootdir_inum(const struct xfs_inode *ip);
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 1e82c727af8e..38a246b8bf11 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -26,6 +26,9 @@
#include "xfs_errortag.h"
#include "xfs_icache.h"
#include "xfs_refcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -34,6 +37,7 @@
#include "scrub/bitmap.h"
#include "scrub/off_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -61,7 +65,10 @@ struct xrep_cow {
struct xoff_bitmap bad_fileoffs;
/* Bitmap of fsblocks that were removed from the CoW fork. */
- struct xfsb_bitmap old_cowfork_fsblocks;
+ union {
+ struct xfsb_bitmap old_cowfork_fsblocks;
+ struct xrtb_bitmap old_cowfork_rtblocks;
+ };
/* CoW fork mappings used to scan for bad CoW staging extents. */
struct xfs_bmbt_irec irec;
@@ -137,7 +144,6 @@ xrep_cow_mark_shared_staging(
{
struct xrep_cow *xc = priv;
struct xfs_refcount_irec rrec;
- xfs_fsblock_t fsbno;
if (!xfs_refcount_check_domain(rec) ||
rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
@@ -145,9 +151,9 @@ xrep_cow_mark_shared_staging(
xrep_cow_trim_refcount(xc, &rrec, rec);
- fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
- rrec.rc_startblock);
- return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
+ return xrep_cow_mark_file_range(xc,
+ xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock),
+ rrec.rc_blockcount);
}
/*
@@ -178,8 +184,7 @@ xrep_cow_mark_missing_staging(
goto next;
error = xrep_cow_mark_file_range(xc,
- XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
- xc->next_bno),
+ xfs_gbno_to_fsb(cur->bc_group, xc->next_bno),
rrec.rc_startblock - xc->next_bno);
if (error)
return error;
@@ -200,7 +205,6 @@ xrep_cow_mark_missing_staging_rmap(
void *priv)
{
struct xrep_cow *xc = priv;
- xfs_fsblock_t fsbno;
xfs_agblock_t rec_bno;
xfs_extlen_t rec_len;
unsigned int adj;
@@ -222,8 +226,8 @@ xrep_cow_mark_missing_staging_rmap(
rec_len -= adj;
}
- fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
- return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+ return xrep_cow_mark_file_range(xc,
+ xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len);
}
/*
@@ -275,8 +279,7 @@ xrep_cow_find_bad(
if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
error = xrep_cow_mark_file_range(xc,
- XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
- xc->next_bno),
+ xfs_agbno_to_fsb(pag, xc->next_bno),
xc->irec_startbno + xc->irec.br_blockcount -
xc->next_bno);
if (error)
@@ -312,6 +315,92 @@ out_pag:
}
/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad_rt(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_scrub *sc = xc->sc;
+ struct xfs_rtgroup *rtg;
+ int error = 0;
+
+ xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock);
+
+ rtg = xfs_rtgroup_get(sc->mp,
+ xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock));
+ if (!rtg)
+ return -EFSCORRUPTED;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr,
+ XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT);
+ if (error)
+ goto out_rtg;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sr;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sr;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ xfs_rgbno_to_rtb(rtg, xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sr;
+ }
+
+ /* Mark any area has an rmap that isn't a COW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sr;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone
+ * turned on the debugging knob, replace everything in the
+ * CoW fork and then scan for staging extents in the refcountbt.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ goto out_rtg;
+ }
+
+out_sr:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+out_rtg:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+
+/*
* Allocate a replacement CoW staging extent of up to the given number of
* blocks, and fill out the mapping.
*/
@@ -344,7 +433,7 @@ xrep_cow_alloc(
if (args.fsbno == NULLFSBLOCK)
return -ENOSPC;
- xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+ xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len);
repl->fsbno = args.fsbno;
repl->len = args.len;
@@ -352,6 +441,32 @@ xrep_cow_alloc(
}
/*
+ * Allocate a replacement rt CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc_rt(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen);
+ int error;
+
+ error = xfs_trans_reserve_more(sc->tp, 0, maxrtx);
+ if (error)
+ return error;
+
+ error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false,
+ false, &repl->fsbno, &repl->len);
+ if (error)
+ return error;
+
+ xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len);
+ return 0;
+}
+
+/*
* Look up the current CoW fork mapping so that we only allocate enough to
* replace a single mapping. If we don't find a mapping that covers the start
* of the file range, or we find a delalloc or written extent, something is
@@ -468,7 +583,10 @@ xrep_cow_replace_range(
*/
alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
nextoff - startoff);
- error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_alloc_rt(sc, alloc_len, &repl);
+ else
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
if (error)
return error;
@@ -484,8 +602,12 @@ xrep_cow_replace_range(
return error;
/* Note the old CoW staging extents; we'll reap them all later. */
- error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
- repl.len);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks,
+ got.br_startblock, repl.len);
+ else
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks,
+ got.br_startblock, repl.len);
if (error)
return error;
@@ -541,8 +663,16 @@ xrep_bmap_cow(
if (!ifp)
return 0;
- /* realtime files aren't supported yet */
- if (XFS_IS_REALTIME_INODE(sc->ip))
+ /*
+ * Realtime files with large extent sizes are not supported because
+ * we could encounter an CoW mapping that has been partially written
+ * out *and* requires replacement, and there's no solution to that.
+ */
+ if (xfs_inode_has_bigrtalloc(sc->ip))
+ return -EOPNOTSUPP;
+
+ /* Metadata inodes aren't supposed to have data on the rt volume. */
+ if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip))
return -EOPNOTSUPP;
/*
@@ -563,7 +693,10 @@ xrep_bmap_cow(
xc->sc = sc;
xoff_bitmap_init(&xc->bad_fileoffs);
- xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_init(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
for_each_xfs_iext(ifp, &icur, &xc->irec) {
if (xchk_should_terminate(sc, &error))
@@ -586,7 +719,10 @@ xrep_bmap_cow(
if (xfs_bmap_is_written_extent(&xc->irec))
continue;
- error = xrep_cow_find_bad(xc);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_cow_find_bad_rt(xc);
+ else
+ error = xrep_cow_find_bad(xc);
if (error)
goto out_bitmap;
}
@@ -601,14 +737,21 @@ xrep_bmap_cow(
* by the refcount btree, not the inode, so it is correct to treat them
* like inode metadata.
*/
- error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
- &XFS_RMAP_OINFO_COW);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks,
+ &XFS_RMAP_OINFO_COW);
+ else
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
if (error)
goto out_bitmap;
out_bitmap:
- xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks);
+ else
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
xoff_bitmap_destroy(&xc->bad_fileoffs);
- kmem_free(xc);
+ kfree(xc);
return error;
}
diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h
new file mode 100644
index 000000000000..0c6e3aad4395
--- /dev/null
+++ b/fs/xfs/scrub/dab_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DAB_BITMAP_H__
+#define __XFS_SCRUB_DAB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_dablk_t */
+
+struct xdab_bitmap {
+ struct xbitmap32 dabitmap;
+};
+
+static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->dabitmap);
+}
+
+static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->dabitmap);
+}
+
+static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->dabitmap, dabno, len);
+}
+
+static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap,
+ xfs_dablk_t dabno, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->dabitmap, dabno, len);
+}
+
+#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 82b150d3b8b7..056de4819f86 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -78,6 +78,22 @@ xchk_da_set_corrupt(
__return_address);
}
+/* Flag a da btree node in need of optimization. */
+void
+xchk_da_set_preen(
+ struct xchk_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_fblock_preen(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
+/* Find an entry at a certain level in a da btree. */
static struct xfs_da_node_entry *
xchk_da_btree_node_entry(
struct xchk_da_btree *ds,
@@ -320,6 +336,7 @@ xchk_da_btree_block(
struct xfs_da3_blkinfo *hdr3;
struct xfs_da_args *dargs = &ds->dargs;
struct xfs_inode *ip = ds->dargs.dp;
+ xfs_failaddr_t fa;
xfs_ino_t owner;
int *pmaxrecs;
struct xfs_da3_icnode_hdr nodehdr;
@@ -442,6 +459,12 @@ xchk_da_btree_block(
goto out_freebp;
}
+ fa = xfs_da3_header_check(blk->bp, dargs->owner);
+ if (fa) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
/*
* If we've been handed a block that is below the dabtree root, does
* its hashval match what the parent block expected to see?
@@ -494,6 +517,7 @@ xchk_da_btree(
ds->dargs.whichfork = whichfork;
ds->dargs.trans = sc->tp;
ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->dargs.owner = sc->ip->i_ino;
ds->state = xfs_da_state_alloc(&ds->dargs);
ds->sc = sc;
ds->private = private;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 4f8c2138a1ec..de291e3b77dd 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -35,6 +35,9 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
/* Check for da btree corruption. */
void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level);
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
+
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp);
int xchk_da_btree(struct xfs_scrub *sc, int whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d86ab51af928..c877bde71e62 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -16,22 +16,70 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_health.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
/* Set us up to scrub directories. */
int
xchk_setup_directory(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_directory(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
/* Directories */
+/* Deferred directory entry that we saved for later. */
+struct xchk_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Child inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_dir {
+ struct xfs_scrub *sc;
+
+ /* information for parent pointer validation. */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Fixed-size array of xchk_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing dirent names. */
+ struct xfblob *dir_names;
+
+ /* If we've cycled the ILOCK, we must revalidate deferred dirents. */
+ bool need_revalidate;
+
+ /* Name buffer for dirent revalidation. */
+ struct xfs_name xname;
+ uint8_t namebuf[MAXNAMELEN];
+};
+
/* Scrub a directory entry. */
/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
@@ -52,6 +100,116 @@ xchk_dir_check_ftype(
if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * Metadata and regular inodes cannot cross trees. This property
+ * cannot change without a full inode free and realloc cycle, so it's
+ * safe to check this without holding locks.
+ */
+ if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+}
+
+/*
+ * Try to lock a child file for checking parent pointers. Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_dir_lock_child(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
+
+/* Check the backwards link (parent pointer) associated with this dirent. */
+STATIC int
+xchk_dir_parent_pointer(
+ struct xchk_dir *sd,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ int error;
+
+ xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip);
+ error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec,
+ &sd->pptr_args);
+ if (error == -ENOATTR)
+ xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+ return 0;
+}
+
+/* Look for a parent pointer matching this dirent, if the child isn't busy. */
+STATIC int
+xchk_dir_check_pptr_fast(
+ struct xchk_dir *sd,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = sd->sc;
+ unsigned int lockmode;
+ int error;
+
+ /* dot and dotdot entries do not have parent pointers */
+ if (xfs_dir2_samename(name, &xfs_name_dot) ||
+ xfs_dir2_samename(name, &xfs_name_dotdot))
+ return 0;
+
+ /* No self-referential non-dot or dotdot dirents. */
+ if (ip == sc->ip) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return -ECANCELED;
+ }
+
+ /* Try to lock the inode. */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (!lockmode) {
+ struct xchk_dirent save_de = {
+ .namelen = name->len,
+ .ino = ip->i_ino,
+ };
+
+ /* Couldn't lock the inode, so save the dirent for later. */
+ trace_xchk_dir_defer(sc->ip, name, ip->i_ino);
+
+ error = xfblob_storename(sd->dir_names, &save_de.name_cookie,
+ name);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ error = xfarray_append(sd->dir_entries, &save_de);
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+
+ return 0;
+ }
+
+ error = xchk_dir_parent_pointer(sd, name, ip);
+ xfs_iunlock(ip, lockmode);
+ return error;
}
/*
@@ -71,6 +229,7 @@ xchk_dir_actor(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip;
+ struct xchk_dir *sd = priv;
xfs_ino_t lookup_ino;
xfs_dablk_t offset;
int error = 0;
@@ -93,16 +252,16 @@ xchk_dir_actor(
return -ECANCELED;
}
- if (!strncmp(".", name->name, name->len)) {
+ if (xfs_dir2_samename(name, &xfs_name_dot)) {
/* If this is "." then check that the inum matches the dir. */
if (ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- } else if (!strncmp("..", name->name, name->len)) {
+ } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
*/
- if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino)
+ if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
}
@@ -137,6 +296,14 @@ xchk_dir_actor(
goto out;
xchk_dir_check_ftype(sc, offset, ip, name->type);
+
+ if (xfs_has_parent(mp)) {
+ error = xchk_dir_check_pptr_fast(sd, dapos, name, ip);
+ if (error)
+ goto out_rele;
+ }
+
+out_rele:
xchk_irele(sc, ip);
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -196,8 +363,8 @@ xchk_dir_rec(
xchk_da_set_corrupt(ds, level);
goto out;
}
- error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
- XFS_DABUF_MAP_HOLE_OK, &bp);
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner,
+ rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp);
if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
&error))
goto out;
@@ -315,10 +482,11 @@ xchk_directory_data_bestfree(
/* dir block format */
if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
- error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp);
} else {
/* dir data format */
- error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk,
+ 0, &bp);
}
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -470,7 +638,7 @@ xchk_directory_leaf1_bestfree(
int error;
/* Read the free space block. */
- error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -531,10 +699,9 @@ xchk_directory_leaf1_bestfree(
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
xfs_dir2_db_to_da(args->geo, i),
- XFS_DABUF_MAP_HOLE_OK,
- &dbp);
+ XFS_DABUF_MAP_HOLE_OK, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
@@ -577,7 +744,7 @@ xchk_directory_free_bestfree(
int error;
/* Read the free space block */
- error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+ error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
return error;
xchk_buffer_recheck(sc, bp);
@@ -597,7 +764,7 @@ xchk_directory_free_bestfree(
stale++;
continue;
}
- error = xfs_dir3_data_read(sc->tp, sc->ip,
+ error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
(freehdr.firstdb + i) * args->geo->fsbcount,
0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
@@ -621,10 +788,11 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args = {
- .dp = sc ->ip,
+ .dp = sc->ip,
.whichfork = XFS_DATA_FORK,
.geo = sc->mp->m_dir_geo,
.trans = sc->tp,
+ .owner = sc->ip->i_ino,
};
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
@@ -648,7 +816,8 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- error = xfs_dir2_isblock(&args, &is_block);
+ if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK)
+ is_block = true;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -752,11 +921,148 @@ out:
return error;
}
+/*
+ * Revalidate a dirent that we collected in the past but couldn't check because
+ * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it
+ * has gone away on us, or a negative errno.
+ */
+STATIC int
+xchk_dir_revalidate_dirent(
+ struct xchk_dir *sd,
+ const struct xfs_name *xname,
+ xfs_ino_t ino)
+{
+ struct xfs_scrub *sc = sd->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Look up the directory entry. If we get -ENOENT, the directory entry
+ * went away and there's nothing to revalidate. Return any other
+ * error.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino);
+ if (error)
+ return error;
+
+ /* The inode number changed, nothing to revalidate. */
+ if (ino != child_ino)
+ return -ENOENT;
+
+ return 0;
+}
+
+/*
+ * Check a directory entry's parent pointers the slow way, which means we cycle
+ * locks a bunch and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_dir_slow_dirent(
+ struct xchk_dir *sd,
+ struct xchk_dirent *dirent,
+ const struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = sd->sc;
+ struct xfs_inode *ip;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred dirent still exists. */
+ if (sd->need_revalidate) {
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_iget(sc, dirent->ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged child, we can
+ * proceed with the validation.
+ */
+ lockmode = xchk_dir_lock_child(sc, ip);
+ if (lockmode) {
+ trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino);
+ goto check_pptr;
+ }
+
+ /*
+ * We couldn't lock the child file. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ sd->need_revalidate = true;
+
+ trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate, since we just cycled the locks. */
+ error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_unlock;
+
+check_pptr:
+ error = xchk_dir_parent_pointer(sd, xname, ip);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+out_rele:
+ xchk_irele(sc, ip);
+ return error;
+}
+
+/* Check all the dirents that we deferred the first time around. */
+STATIC int
+xchk_dir_finish_slow_dirents(
+ struct xchk_dir *sd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(sd->dir_entries, array_cur) {
+ struct xchk_dirent dirent;
+
+ if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(sd->dir_entries, array_cur, &dirent);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(sd->dir_names, dirent.name_cookie,
+ &sd->xname, dirent.namelen);
+ if (error)
+ return error;
+
+ error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Scrub a whole directory. */
int
xchk_directory(
struct xfs_scrub *sc)
{
+ struct xchk_dir *sd;
int error;
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
@@ -789,9 +1095,60 @@ xchk_directory(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS);
+ if (!sd)
+ return -ENOMEM;
+ sd->sc = sc;
+ sd->xname.name = sd->namebuf;
+
+ if (xfs_has_parent(sc->mp)) {
+ char *descr;
+
+ /*
+ * Set up some staging memory for dirents that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirent),
+ &sd->dir_entries);
+ kfree(descr);
+ if (error)
+ goto out_sd;
+
+ descr = xchk_xfile_ino_descr(sc, "slow directory entry names");
+ error = xfblob_create(descr, &sd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+ }
+
/* Look up every name in this directory by hash. */
- error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error && error != -ECANCELED)
+ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd);
+ if (error == -ECANCELED)
+ error = 0;
+ if (error)
+ goto out_names;
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xchk_dir_finish_slow_dirents(sd);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+ }
+
+out_names:
+ if (sd->dir_names)
+ xfblob_destroy(sd->dir_names);
+out_entries:
+ if (sd->dir_entries)
+ xfarray_destroy(sd->dir_entries);
+out_sd:
+ kvfree(sd);
+ if (error)
return error;
/* If the dir is clean, it is clearly not zapped. */
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
new file mode 100644
index 000000000000..249313882108
--- /dev/null
+++ b/fs/xfs/scrub/dir_repair.c
@@ -0,0 +1,1968 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_ag.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+#include "scrub/reap.h"
+#include "scrub/findparent.h"
+#include "scrub/orphanage.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Directory Repair
+ * ================
+ *
+ * We repair directories by reading the directory data blocks looking for
+ * directory entries that look salvageable (name passes verifiers, entry points
+ * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * directory to constrain memory use. Batching the construction of the
+ * temporary directory in this fashion reduces lock cycling of the directory
+ * being repaired and the temporary directory, and will later become important
+ * for parent pointer scanning.
+ *
+ * If parent pointers are enabled on this filesystem, we instead reconstruct
+ * the directory by visiting each parent pointer of each file in the filesystem
+ * and translating the relevant parent pointer records into dirents. In this
+ * case, it is advantageous to stash all directory entries created from parent
+ * pointers for a single child file before replaying them into the temporary
+ * directory. To save memory, the live filesystem scan reuses the findparent
+ * fields. Directory repair chooses either parent pointer scanning or
+ * directory entry salvaging, but not both.
+ *
+ * Directory entries added to the temporary directory do not elevate the link
+ * counts of the inodes found. When salvaging completes, the remaining stashed
+ * entries are replayed to the temporary directory. An atomic mapping exchange
+ * is used to commit the new directory blocks to the directory being repaired.
+ * This will disrupt readdir cursors.
+ *
+ * Locking Issues
+ * --------------
+ *
+ * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
+ * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
+ * b's dotdot update. This is in contrast to every other dotdot update (link,
+ * remove, mkdir). If the repair code drops the ILOCK, it must either
+ * revalidate the dotdot entry or use dirent hooks to capture updates from
+ * other threads.
+ */
+
+/* Create a dirent in the tempdir. */
+#define XREP_DIRENT_ADD (1)
+
+/* Remove a dirent from the tempdir. */
+#define XREP_DIRENT_REMOVE (2)
+
+/* Directory entry to be restored in the new directory. */
+struct xrep_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Target inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the dirent name. */
+ uint8_t namelen;
+
+ /* File type of the dirent. */
+ uint8_t ftype;
+
+ /* XREP_DIRENT_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
+ * before we write them to the temp dir.
+ */
+#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_dir {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing directory entry names. */
+ struct xfblob *dir_names;
+
+ /* Information for exchanging data forks at the end. */
+ struct xrep_tempexch tx;
+
+ /* Preallocated args struct for performing dir operations */
+ struct xfs_da_args args;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. For directory salvaging when
+ * parent pointers are not enabled, we use the findparent_* functions
+ * on this object and access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, however, the pptr scanner uses the
+ * iscan, hooks, lock, and parent_ino fields of this object directly.
+ * @pscan.lock coordinates access to dir_entries, dir_names,
+ * parent_ino, subdirs, dirents, and args. This reduces the memory
+ * requirements of this structure.
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /*
+ * Context information for attaching this directory to the lost+found
+ * if this directory does not have a parent.
+ */
+ struct xrep_adoption adoption;
+
+ /* How many subdirectories did we find? */
+ uint64_t subdirs;
+
+ /* How many dirents did we find? */
+ unsigned int dirents;
+
+ /* Should we move this directory to the orphanage? */
+ bool needs_adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+};
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_dir_teardown(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+
+ xrep_findparent_scan_teardown(&rd->pscan);
+ xfblob_destroy(rd->dir_names);
+ xfarray_destroy(rd->dir_entries);
+}
+
+/* Set up for a directory repair. */
+int
+xrep_setup_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ error = xrep_orphanage_try_create(sc);
+ if (error)
+ return error;
+
+ error = xrep_tempfile_create(sc, S_IFDIR);
+ if (error)
+ return error;
+
+ rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
+ if (!rd)
+ return -ENOMEM;
+ rd->sc = sc;
+ rd->xname.name = rd->namebuf;
+ sc->buf = rd;
+
+ return 0;
+}
+
+/*
+ * Look up the dotdot entry and confirm that it's really the parent.
+ * Returns NULLFSINO if we don't know what to do.
+ */
+static inline xfs_ino_t
+xrep_dir_lookup_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t ino;
+ int error;
+
+ error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
+ if (error)
+ return NULLFSINO;
+ if (!xfs_verify_dir_ino(sc->mp, ino))
+ return NULLFSINO;
+
+ error = xrep_findparent_confirm(sc, &ino);
+ if (error)
+ return NULLFSINO;
+
+ return ino;
+}
+
+/*
+ * Look up '..' in the dentry cache and confirm that it's really the parent.
+ * Returns NULLFSINO if the dcache misses or if the hit is implausible.
+ */
+static inline xfs_ino_t
+xrep_dir_dcache_parent(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t parent_ino;
+ int error;
+
+ parent_ino = xrep_findparent_from_dcache(sc);
+ if (parent_ino == NULLFSINO)
+ return parent_ino;
+
+ error = xrep_findparent_confirm(sc, &parent_ino);
+ if (error)
+ return NULLFSINO;
+
+ return parent_ino;
+}
+
+/* Try to find the parent of the directory being repaired. */
+STATIC int
+xrep_dir_find_parent(
+ struct xrep_dir *rd)
+{
+ xfs_ino_t ino;
+
+ ino = xrep_findparent_self_reference(rd->sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_dcache_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ ino = xrep_dir_lookup_parent(rd);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rd->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * A full filesystem scan is the last resort. On a busy filesystem,
+ * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
+ * that we don't know what who the parent is, so we should return to
+ * userspace.
+ */
+ return xrep_findparent_scan(&rd->pscan);
+}
+
+/*
+ * Decide if we want to salvage this entry. We don't bother with oversized
+ * names or the dot entry.
+ */
+STATIC int
+xrep_dir_want_salvage(
+ struct xrep_dir *rd,
+ const char *name,
+ int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+
+ /* No pointers to ourselves or to garbage. */
+ if (ino == rd->sc->ip->i_ino)
+ return false;
+ if (!xfs_verify_dir_ino(mp, ino))
+ return false;
+
+ /* No weird looking names or dot entries. */
+ if (namelen >= MAXNAMELEN || namelen <= 0)
+ return false;
+ if (namelen == 1 && name[0] == '.')
+ return false;
+ if (!xfs_dir2_namecheck(name, namelen))
+ return false;
+
+ return true;
+}
+
+/*
+ * Remember that we want to create a dirent in the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_ADD,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/*
+ * Remember that we want to remove a dirent from the tempdir. These stashed
+ * actions will be replayed later.
+ */
+STATIC int
+xrep_dir_stash_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t ino)
+{
+ struct xrep_dirent dirent = {
+ .action = XREP_DIRENT_REMOVE,
+ .ino = ino,
+ .namelen = name->len,
+ .ftype = name->type,
+ };
+ int error;
+
+ trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
+
+ error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rd->dir_entries, &dirent);
+}
+
+/* Allocate an in-core record to hold entries while we rebuild the dir data. */
+STATIC int
+xrep_dir_salvage_entry(
+ struct xrep_dir *rd,
+ unsigned char *name,
+ unsigned int namelen,
+ xfs_ino_t ino)
+{
+ struct xfs_name xname = {
+ .name = name,
+ };
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ unsigned int i = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Truncate the name to the first character that would trip namecheck.
+ * If we no longer have a name after that, ignore this entry.
+ */
+ while (i < namelen && name[i] != 0 && name[i] != '/')
+ i++;
+ if (i == 0)
+ return 0;
+ xname.len = i;
+
+ /* Ignore '..' entries; we already picked the new parent. */
+ if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
+ trace_xrep_dir_salvaged_parent(sc->ip, ino);
+ return 0;
+ }
+
+ trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
+
+ /*
+ * Compute the ftype or dump the entry if we can't. We don't lock the
+ * inode because inodes can't change type while we have a reference.
+ */
+ error = xchk_iget(sc, ino, &ip);
+ if (error)
+ return 0;
+
+ /* Don't mix metadata and regular directory trees. */
+ if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) {
+ xchk_irele(sc, ip);
+ return 0;
+ }
+
+ xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ xchk_irele(sc, ip);
+
+ return xrep_dir_stash_createname(rd, &xname, ino);
+}
+
+/* Record a shortform directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_sf_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_sf_hdr *sfp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ xfs_ino_t ino;
+
+ ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
+ if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
+}
+
+/* Record a regular directory entry for later reinsertion. */
+STATIC int
+xrep_dir_salvage_data_entry(
+ struct xrep_dir *rd,
+ struct xfs_dir2_data_entry *dep)
+{
+ xfs_ino_t ino;
+
+ ino = be64_to_cpu(dep->inumber);
+ if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
+ return 0;
+
+ return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
+}
+
+/* Try to recover block/data format directory entries. */
+STATIC int
+xrep_dir_recover_data(
+ struct xrep_dir *rd,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
+ unsigned int offset;
+ unsigned int end;
+ int error = 0;
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ offset = geo->data_entry_offset;
+ end = min_t(unsigned int, BBTOB(bp->b_length),
+ xfs_dir3_data_end_offset(geo, bp->b_addr));
+
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ /* Skip unused entries. */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /* Don't walk off the end of the block. */
+ offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
+ if (offset > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_data_entry(rd, dep);
+ if (error)
+ return error;
+
+ }
+
+ return 0;
+}
+
+/* Try to recover shortform directory entries. */
+STATIC int
+xrep_dir_recover_sf(
+ struct xrep_dir *rd)
+{
+ struct xfs_dir2_sf_hdr *hdr;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_entry *next;
+ struct xfs_ifork *ifp;
+ xfs_ino_t ino;
+ unsigned char *end;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
+ hdr = ifp->if_data;
+ end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+
+ ino = xfs_dir2_sf_get_parent_ino(hdr);
+ trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
+
+ sfep = xfs_dir2_sf_firstentry(hdr);
+ while ((unsigned char *)sfep < end) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
+ if ((unsigned char *)next > end)
+ break;
+
+ /* Ok, let's save this entry. */
+ error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
+ if (error)
+ return error;
+
+ sfep = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to figure out the format of this directory from the data fork mappings
+ * and the directory size. If we can be reasonably sure of format, we can be
+ * more aggressive in salvaging directory entries. On return, @magic_guess
+ * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
+ * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
+ * and 0 if we can't tell.
+ */
+STATIC void
+xrep_dir_guess_format(
+ struct xrep_dir *rd,
+ __be32 *magic_guess)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ struct xfs_mount *mp = rd->sc->mp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_fileoff_t last;
+ int error;
+
+ ASSERT(xfs_has_crc(mp));
+
+ *magic_guess = 0;
+
+ /*
+ * If there's a single directory block and the directory size is
+ * exactly one block, this has to be a single block format directory.
+ */
+ error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
+ if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
+ dp->i_disk_size == geo->blksize) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ return;
+ }
+
+ /*
+ * If the last extent before the leaf offset matches the directory
+ * size and the directory size is larger than 1 block, this is a
+ * data format directory.
+ */
+ last = geo->leafblk;
+ error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
+ if (!error &&
+ XFS_FSB_TO_B(mp, last) > geo->blksize &&
+ XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
+ *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ return;
+ }
+}
+
+/* Recover directory entries from a specific directory block. */
+STATIC int
+xrep_dir_recover_dirblock(
+ struct xrep_dir *rd,
+ __be32 magic_guess,
+ xfs_dablk_t dabno)
+{
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_buf *bp;
+ __be32 oldmagic;
+ int error;
+
+ /*
+ * Try to read buffer. We invalidate them in the next step so we don't
+ * bother to set a buffer type or ops.
+ */
+ error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
+ XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
+ if (error || !bp)
+ return error;
+
+ hdr = bp->b_addr;
+ oldmagic = hdr->magic;
+
+ trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
+ be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
+
+ /*
+ * If we're sure of the block's format, proceed with the salvage
+ * operation using the specified magic number.
+ */
+ if (magic_guess) {
+ hdr->magic = magic_guess;
+ goto recover;
+ }
+
+ /*
+ * If we couldn't guess what type of directory this is, then we will
+ * only salvage entries from directory blocks that match the magic
+ * number and pass verifiers.
+ */
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
+ goto out;
+ if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
+ goto out;
+ if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ default:
+ goto out;
+ }
+
+recover:
+ error = xrep_dir_recover_data(rd, bp);
+
+out:
+ hdr->magic = oldmagic;
+ xfs_trans_brelse(rd->sc->tp, bp);
+ return error;
+}
+
+static inline void
+xrep_dir_init_args(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ memset(&rd->args, 0, sizeof(struct xfs_da_args));
+ rd->args.geo = rd->sc->mp->m_dir_geo;
+ rd->args.whichfork = XFS_DATA_FORK;
+ rd->args.owner = rd->sc->ip->i_ino;
+ rd->args.trans = rd->sc->tp;
+ rd->args.dp = dp;
+ if (!name)
+ return;
+ rd->args.name = name->name;
+ rd->args.namelen = name->len;
+ rd->args.filetype = name->type;
+ rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
+}
+
+/* Replay a stashed createname into the temporary directory. */
+STATIC int
+xrep_dir_replay_createname(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = rd->sc->tempip;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ trace_xrep_dir_replay_createname(dp, name, inum);
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ return xfs_dir_createname_args(&rd->args);
+}
+
+/* Replay a stashed removename onto the temporary directory. */
+STATIC int
+xrep_dir_replay_removename(
+ struct xrep_dir *rd,
+ const struct xfs_name *name,
+ xfs_extlen_t total)
+{
+ struct xfs_inode *dp = rd->args.dp;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.op_flags = 0;
+ rd->args.total = total;
+
+ trace_xrep_dir_replay_removename(dp, name, 0);
+ return xfs_dir_removename_args(&rd->args);
+}
+
+/*
+ * Add this stashed incore directory entry to the temporary directory.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_dir_replay_update(
+ struct xrep_dir *rd,
+ const struct xfs_name *xname,
+ const struct xrep_dirent *dirent)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+#ifdef DEBUG
+ xfs_ino_t ino;
+#endif
+ uint resblks;
+ int error;
+
+ resblks = xfs_link_space_res(mp, xname->len);
+ error = xchk_trans_alloc(rd->sc, resblks);
+ if (error)
+ return error;
+
+ /* Lock the temporary directory and join it to the transaction */
+ xrep_tempfile_ilock(rd->sc);
+ xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+ switch (dirent->action) {
+ case XREP_DIRENT_ADD:
+ /*
+ * Create a replacement dirent in the temporary directory.
+ * Note that _createname doesn't check for existing entries.
+ * There shouldn't be any in the temporary dir, but we'll
+ * verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error != -ENOENT) {
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_createname(rd, xname, dirent->ino,
+ resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs++;
+ rd->dirents++;
+ break;
+ case XREP_DIRENT_REMOVE:
+ /*
+ * Remove a dirent from the temporary directory. Note that
+ * _removename doesn't check the inode target of the exist
+ * entry. There should be a perfect match in the temporary
+ * dir, but we'll verify this in debug mode.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error) {
+ ASSERT(error != 0);
+ goto out_cancel;
+ }
+ if (ino != dirent->ino) {
+ ASSERT(ino == dirent->ino);
+ error = -EIO;
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_removename(rd, xname, resblks);
+ if (error)
+ goto out_cancel;
+
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs--;
+ rd->dirents--;
+ break;
+ default:
+ ASSERT(0);
+ error = -EIO;
+ goto out_cancel;
+ }
+
+ /* Commit and unlock. */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(rd->sc);
+ return 0;
+out_cancel:
+ xchk_trans_cancel(rd->sc);
+ xrep_tempfile_iunlock(rd->sc);
+ return error;
+}
+
+/*
+ * Flush stashed incore dirent updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the directory rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+ struct xrep_dir *rd)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /* Add all the salvaged dirents to the temporary directory. */
+ mutex_lock(&rd->pscan.lock);
+ foreach_xfarray_idx(rd->dir_entries, array_cur) {
+ struct xrep_dirent dirent;
+
+ error = xfarray_load(rd->dir_entries, array_cur, &dirent);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
+ &rd->xname, dirent.namelen);
+ if (error)
+ goto out_unlock;
+ rd->xname.type = dirent.ftype;
+ mutex_unlock(&rd->pscan.lock);
+
+ error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
+ if (error)
+ return error;
+ mutex_lock(&rd->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rd->dir_entries);
+ xfblob_truncate(rd->dir_names);
+ mutex_unlock(&rd->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir. This
+ * is done to reduce the memory requirements of the directory rebuild, since
+ * directories can contain up to 32GB of directory data.
+ */
+STATIC int
+xrep_dir_flush_stashed(
+ struct xrep_dir *rd)
+{
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during dirent salvaging to avoid livelocking if there
+ * are cycles in the directory structures. We hold ILOCK_EXCL on both
+ * the inode being repaired and the temporary file, though they are
+ * not ijoined to the scrub transaction.
+ *
+ * To constrain kernel memory use, we occasionally write salvaged
+ * dirents from the xfarray and xfblob structures into the temporary
+ * directory in preparation for exchanging the directory structures at
+ * the end. Updating the temporary file requires a transaction, so we
+ * commit the scrub transaction and drop the two ILOCKs so that
+ * we can allocate whatever transaction we want.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from accessing the damaged directory data while we
+ * repair it.
+ */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify dirents. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /* Write to the tempdir all the updates that we've stashed. */
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Recreate the salvage transaction and relock the dir we're salvaging.
+ */
+ error = xchk_trans_alloc(rd->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much dirent data in memory. */
+static inline bool
+xrep_dir_want_flush_stashed(
+ struct xrep_dir *rd)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
+ return bytes > XREP_DIR_MAX_STASH_BYTES;
+}
+
+/* Extract as many directory entries as we can. */
+STATIC int
+xrep_dir_recover(
+ struct xrep_dir *rd)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
+ xfs_fileoff_t offset;
+ xfs_dablk_t dabno;
+ __be32 magic_guess;
+ int nmap;
+ int error;
+
+ xrep_dir_guess_format(rd, &magic_guess);
+
+ /* Iterate each directory data block in the data fork. */
+ for (offset = 0;
+ offset < geo->leafblk;
+ offset = got.br_startoff + got.br_blockcount) {
+ nmap = 1;
+ error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
+ &got, &nmap, 0);
+ if (error)
+ return error;
+ if (nmap != 1)
+ return -EFSCORRUPTED;
+ if (!xfs_bmap_is_written_extent(&got))
+ continue;
+
+ for (dabno = round_up(got.br_startoff, geo->fsbcount);
+ dabno < got.br_startoff + got.br_blockcount;
+ dabno += geo->fsbcount) {
+ if (xchk_should_terminate(rd->sc, &error))
+ return error;
+
+ error = xrep_dir_recover_dirblock(rd,
+ magic_guess, dabno);
+ if (error)
+ return error;
+
+ /* Flush dirents to constrain memory usage. */
+ if (xrep_dir_want_flush_stashed(rd)) {
+ error = xrep_dir_flush_stashed(rd);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find all the directory entries for this inode by scraping them out of the
+ * directory leaf blocks by hand, and flushing them into the temp dir.
+ */
+STATIC int
+xrep_dir_find_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_inode *dp = rd->sc->ip;
+ int error;
+
+ /*
+ * Salvage directory entries from the old directory, and write them to
+ * the temporary directory.
+ */
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xrep_dir_recover_sf(rd);
+ } else {
+ error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ error = xrep_dir_recover(rd);
+ }
+ if (error)
+ return error;
+
+ return xrep_dir_flush_stashed(rd);
+}
+
+/* Scan all files in the filesystem for dirents. */
+STATIC int
+xrep_dir_salvage_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for this
+ * directory's parent. Figure out who is going to be the parent of
+ * this directory, then retake the ILOCK so that we can salvage
+ * directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ error = xrep_dir_find_parent(rd);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * Collect directory entries by parsing raw leaf blocks to salvage
+ * whatever we can. When we're done, free the staging memory before
+ * exchanging the directories to reduce memory usage.
+ */
+ error = xrep_dir_find_entries(rd);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the repair transaction and drop the ILOCK so that we can
+ * (later) use the atomic mapping exchange functions to compute the
+ * correct block reservations and re-lock the inodes.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
+ * modifications, but there's nothing to prevent userspace from reading
+ * the directory until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ *
+ * The VFS can change dotdot on us, but the findparent scan will keep
+ * our incore parent inode up to date. See the note on locking issues
+ * for more details.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+
+/*
+ * Examine a parent pointer of a file. If it leads us back to the directory
+ * that we're rebuilding, create an incore dirent from the parent pointer and
+ * stash it.
+ */
+STATIC int
+xrep_dir_scan_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
+ };
+ xfs_ino_t parent_ino;
+ uint32_t parent_gen;
+ struct xrep_dir *rd = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ /*
+ * Ignore parent pointers that point back to a different dir, list the
+ * wrong generation number, or are invalid.
+ */
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, &parent_gen);
+ if (error)
+ return error;
+
+ if (parent_ino != sc->ip->i_ino ||
+ parent_gen != VFS_I(sc->ip)->i_generation)
+ return 0;
+
+ mutex_lock(&rd->pscan.lock);
+ error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ return error;
+}
+
+/*
+ * If this child dirent points to the directory being repaired, remember that
+ * fact so that we can reset the dotdot entry if necessary.
+ */
+STATIC int
+xrep_dir_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_dir *rd = priv;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rd->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage inum. */
+ if (!xfs_verify_dir_ino(rd->sc->mp, ino))
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len >= MAXNAMELEN || name->len <= 0)
+ return 0;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
+ dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
+ return 0;
+}
+
+/*
+ * Decide if we want to look for child dirents or parent pointers in this file.
+ * Skip the dir being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_dir_want_scan(
+ struct xrep_dir *rd,
+ const struct xfs_inode *ip)
+{
+ return ip != rd->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
+ * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_dir_scan_ilock(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_dir_want_scan(rd, ip))
+ goto lock;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents or parent pointers that point to
+ * the directory we're rebuilding.
+ */
+STATIC int
+xrep_dir_scan_file(
+ struct xrep_dir *rd,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_dir_scan_ilock(rd, ip);
+
+ if (!xrep_dir_want_scan(rd, ip))
+ goto scan_done;
+
+ /*
+ * If the extended attributes look as though they has been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
+ if (error)
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/*
+ * Scan all files in the filesystem for parent pointers that we can turn into
+ * replacement dirents, and a dirent that we can use to set the dotdot pointer.
+ */
+STATIC int
+xrep_dir_scan_dirtree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Roots of directory trees are their own parents. */
+ if (xchk_inode_is_dirtree_root(sc->ip))
+ xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
+
+ /*
+ * Filesystem scans are time consuming. Drop the directory ILOCK and
+ * all other resources for the duration of the scan and hope for the
+ * best. The live update hooks will keep our scan information up to
+ * date even though we've dropped the locks.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_dir_scan_file(rd, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed dirent updates to constrain memory usage. */
+ mutex_lock(&rd->pscan.lock);
+ flush = xrep_dir_want_flush_stashed(rd);
+ mutex_unlock(&rd->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rd->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Cancel the empty transaction so that we can (later) use the atomic
+ * file mapping exchange functions to lock files and commit the new
+ * directory.
+ */
+ xchk_trans_cancel(rd->sc);
+ return 0;
+}
+
+/*
+ * Capture dirent updates being made by other threads which are relevant to the
+ * directory being repaired.
+ */
+STATIC int
+xrep_dir_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_dir *rd;
+ struct xfs_scrub *sc;
+ int error = 0;
+
+ rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
+ sc = rd->sc;
+
+ /*
+ * This thread updated a child dirent in the directory that we're
+ * rebuilding. Stash the update for replay against the temporary
+ * directory.
+ */
+ if (p->dp->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
+ mutex_lock(&rd->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_dir_stash_createname(rd, p->name,
+ p->ip->i_ino);
+ else
+ error = xrep_dir_stash_removename(rd, p->name,
+ p->ip->i_ino);
+ mutex_unlock(&rd->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * This thread updated another directory's child dirent that points to
+ * the directory that we're rebuilding, so remember the new dotdot
+ * target.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ trace_xrep_dir_stash_createname(sc->tempip,
+ &xfs_name_dotdot,
+ p->dp->i_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
+ } else {
+ trace_xrep_dir_stash_removename(sc->tempip,
+ &xfs_name_dotdot,
+ rd->pscan.parent_ino);
+
+ xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rd->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Free all the directory blocks and reset the data fork. The caller must
+ * join the inode to the transaction. This function returns with the inode
+ * joined to a clean scrub transaction.
+ */
+STATIC int
+xrep_dir_reset_fork(
+ struct xrep_dir *rd,
+ xfs_ino_t parent_ino)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the directory buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
+
+ /* Reset the data fork to an empty data fork. */
+ xfs_idestroy_fork(ifp);
+ ifp->if_bytes = 0;
+ sc->tempip->i_disk_size = 0;
+
+ /* Reinitialize the short form directory. */
+ xrep_dir_init_args(rd, sc->tempip, NULL);
+ return xfs_dir2_sf_create(&rd->args, parent_ino);
+}
+
+/*
+ * Prepare both inodes' directory forks for exchanging mappings. Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_dir_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's directory is in shortform format, convert that to
+ * a single leaf extent so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_dir2_sf_to_block(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int
+xrep_dir_replace(
+ struct xrep_dir *rd,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t inum,
+ xfs_extlen_t total)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+ error = xfs_dir_ino_validate(sc->mp, inum);
+ if (error)
+ return error;
+
+ xrep_dir_init_args(rd, dp, name);
+ rd->args.inumber = inum;
+ rd->args.total = total;
+ return xfs_dir_replace_args(&rd->args);
+}
+
+/*
+ * Reset the link count of this directory and adjust the unlinked list pointers
+ * as needed.
+ */
+STATIC int
+xrep_dir_set_nlink(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = sc->ip;
+ struct xfs_perag *pag;
+ unsigned int new_nlink = min_t(unsigned long long,
+ rd->subdirs + 2,
+ XFS_NLINK_PINNED);
+ int error;
+
+ /*
+ * The directory is not on the incore unlinked list, which means that
+ * it needs to be reachable via the directory tree. Update the nlink
+ * with our observed link count. If the directory has no parent, it
+ * will be moved to the orphanage.
+ */
+ if (!xfs_inode_on_unlinked_list(dp))
+ goto reset_nlink;
+
+ /*
+ * The directory is on the unlinked list and we did not find any
+ * dirents. Set the link count to zero and let the directory
+ * inactivate when the last reference drops.
+ */
+ if (rd->dirents == 0) {
+ rd->needs_adoption = false;
+ new_nlink = 0;
+ goto reset_nlink;
+ }
+
+ /*
+ * The directory is on the unlinked list and we found dirents. This
+ * directory needs to be reachable via the directory tree. Remove the
+ * dir from the unlinked list and update nlink with the observed link
+ * count. If the directory has no parent, it will be moved to the
+ * orphanage.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, dp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+reset_nlink:
+ if (VFS_I(dp)->i_nlink != new_nlink)
+ set_nlink(VFS_I(dp), new_nlink);
+ return 0;
+}
+
+/*
+ * Finish replaying stashed dirent updates, allocate a transaction for
+ * exchanging data fork mappings, and take the ILOCKs of both directories
+ * before we commit the new directory structure.
+ */
+STATIC int
+xrep_dir_finalize_tempdir(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible dirent updates.
+ * Replay all queued dirent updates into the tempdir before exchanging
+ * the contents, even if that means dropping the ILOCKs and the
+ * transaction.
+ */
+ do {
+ error = xrep_dir_replay_updates(rd);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rd->dir_entries) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/* Exchange the temporary directory's data fork with the one being repaired. */
+STATIC int
+xrep_dir_swap(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t ino;
+ bool ip_local, temp_local;
+ int error = 0;
+
+ /*
+ * If we never found the parent for this directory, temporarily assign
+ * the root dir as the parent; we'll move this to the orphanage after
+ * exchanging the dir contents. We hold the ILOCK of the dir being
+ * repaired, so we're not worried about racy updates of dotdot.
+ */
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+ if (rd->pscan.parent_ino == NULLFSINO) {
+ rd->needs_adoption = true;
+ rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
+ }
+
+ /*
+ * Reset the temporary directory's '..' entry to point to the parent
+ * that we found. The dirent replace code asserts if the dirent
+ * already points at the new inumber, so we look it up here.
+ *
+ * It's also possible that this replacement could also expand a sf
+ * tempdir into block format.
+ */
+ error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
+ if (error)
+ return error;
+
+ if (rd->pscan.parent_ino != ino) {
+ error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
+ rd->pscan.parent_ino, rd->tx.req.resblks);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Changing the dot and dotdot entries could have changed the shape of
+ * the directory, so we recompute these.
+ */
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both files have a local format data fork and the rebuilt
+ * directory data would fit in the repaired file's data fork, copy
+ * the contents from the tempfile and update the directory link count.
+ * We're done now.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return xrep_dir_set_nlink(rd);
+ }
+
+ /*
+ * Clean the transaction before we start working on exchanging
+ * directory contents.
+ */
+ error = xrep_tempfile_roll_trans(rd->sc);
+ if (error)
+ return error;
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ /*
+ * Set nlink of the directory in the same transaction sequence that
+ * (atomically) commits the new directory data.
+ */
+ error = xrep_dir_set_nlink(rd);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, &rd->tx);
+}
+
+/*
+ * Exchange the new directory contents (which we created in the tempfile) with
+ * the directory being repaired.
+ */
+STATIC int
+xrep_dir_rebuild_tree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run dir
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed dirent updates to the tempdir. After this point,
+ * we're ready to exchange data fork mappings.
+ */
+ error = xrep_dir_finalize_tempdir(rd);
+ if (error)
+ return error;
+
+ if (xchk_iscan_aborted(&rd->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the tempdir's data fork with the file being repaired. This
+ * recreates the transaction and re-takes the ILOCK in the scrub
+ * context.
+ */
+ error = xrep_dir_swap(rd);
+ if (error)
+ return error;
+
+ /*
+ * Release the old directory blocks and reset the data fork of the temp
+ * directory to an empty shortform directory because inactivation does
+ * nothing for directories.
+ */
+ error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target directory.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can regenerate directory entries. */
+STATIC int
+xrep_dir_setup_scan(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ char *descr;
+ int error;
+
+ /* Set up some staging memory for salvaging dirents. */
+ descr = xchk_xfile_ino_descr(sc, "directory entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
+ &rd->dir_entries);
+ kfree(descr);
+ if (error)
+ return error;
+
+ descr = xchk_xfile_ino_descr(sc, "directory entry names");
+ error = xfblob_create(descr, &rd->dir_names);
+ kfree(descr);
+ if (error)
+ goto out_xfarray;
+
+ if (xfs_has_parent(sc->mp))
+ error = __xrep_findparent_scan_start(sc, &rd->pscan,
+ xrep_dir_live_update);
+ else
+ error = xrep_findparent_scan_start(sc, &rd->pscan);
+ if (error)
+ goto out_xfblob;
+
+ return 0;
+
+out_xfblob:
+ xfblob_destroy(rd->dir_names);
+ rd->dir_names = NULL;
+out_xfarray:
+ xfarray_destroy(rd->dir_entries);
+ rd->dir_entries = NULL;
+ return error;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_dir_move_to_orphanage(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the orphanage and
+ * prepare for the adoption. Therefore, look up the old dotdot entry
+ * for sc->ip so that we can compare it after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
+ if (error)
+ return error;
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rd->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rd->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rd->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Repair the directory metadata.
+ *
+ * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
+ * cache in XFS can't handle aliased multiblock buffers, so this might
+ * misbehave if the directory blocks are crosslinked with other filesystem
+ * metadata.
+ *
+ * XXX: Is it necessary to check the dcache for this directory to make sure
+ * that we always recreate every cached entry?
+ */
+int
+xrep_directory(
+ struct xfs_scrub *sc)
+{
+ struct xrep_dir *rd = sc->buf;
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ error = xrep_dir_setup_scan(rd);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_dir_scan_dirtree(rd);
+ else
+ error = xrep_dir_salvage_entries(rd);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_dir_rebuild_tree(rd);
+ if (error)
+ goto out_teardown;
+
+ if (rd->needs_adoption) {
+ if (!xrep_orphanage_can_adopt(rd->sc))
+ error = -EFSCORRUPTED;
+ else
+ error = xrep_dir_move_to_orphanage(rd);
+ if (error)
+ goto out_teardown;
+ }
+
+out_teardown:
+ xrep_dir_teardown(sc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
new file mode 100644
index 000000000000..3a9cdf8738b6
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.c
@@ -0,0 +1,1009 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+
+/*
+ * Directory Tree Structure Validation
+ * ===================================
+ *
+ * Validating the tree qualities of the directory tree structure can be
+ * difficult. If the tree is frozen, running a depth (or breadth) first search
+ * and marking a bitmap suffices to determine if there is a cycle. XORing the
+ * mark bitmap with the inode bitmap afterwards tells us if there are
+ * disconnected cycles. If the tree is not frozen, directory updates can move
+ * subtrees across the scanner wavefront, which complicates the design greatly.
+ *
+ * Directory parent pointers change that by enabling an incremental approach to
+ * validation of the tree structure. Instead of using one thread to scan the
+ * entire filesystem, we instead can have multiple threads walking individual
+ * subdirectories upwards to the root. In a perfect world, the IOLOCK would
+ * suffice to stabilize two directories in a parent -> child relationship.
+ * Unfortunately, the VFS does not take the IOLOCK when moving a child
+ * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks
+ * to detect a race. If a race occurs in a path, we restart the scan.
+ *
+ * If the walk terminates without reaching the root, we know the path is
+ * disconnected and ought to be attached to the lost and found. If on the walk
+ * we find the same subdir that we're scanning, we know this is a cycle and
+ * should delete an incoming edge. If we find multiple paths to the root, we
+ * know to delete an incoming edge.
+ *
+ * There are two big hitches with this approach: first, all file link counts
+ * must be correct to prevent other writers from doing the wrong thing with the
+ * directory tree structure. Second, because we're walking upwards in a tree
+ * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a
+ * directory update hook to invalidate the scan results if one of the paths
+ * we've scanned has changed.
+ */
+
+/* Clean up the dirtree checking resources. */
+STATIC void
+xchk_dirtree_buf_cleanup(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ if (dl->scan_ino != NULLFSINO)
+ xfs_dir_hook_del(dl->sc->mp, &dl->dhook);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+
+ xfblob_destroy(dl->path_names);
+ xfarray_destroy(dl->path_steps);
+ mutex_destroy(&dl->lock);
+}
+
+/* Set us up to look for directory loops. */
+int
+xchk_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_dirtree(sc);
+ if (error)
+ return error;
+ }
+
+ dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS);
+ if (!dl)
+ return -ENOMEM;
+ dl->sc = sc;
+ dl->xname.name = dl->namebuf;
+ dl->hook_xname.name = dl->hook_namebuf;
+ INIT_LIST_HEAD(&dl->path_list);
+ dl->root_ino = NULLFSINO;
+ dl->scan_ino = NULLFSINO;
+ dl->parent_ino = NULLFSINO;
+
+ mutex_init(&dl->lock);
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path steps");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step),
+ &dl->path_steps);
+ kfree(descr);
+ if (error)
+ goto out_dl;
+
+ descr = xchk_xfile_ino_descr(sc, "dirtree path names");
+ error = xfblob_create(descr, &dl->path_names);
+ kfree(descr);
+ if (error)
+ goto out_steps;
+
+ error = xchk_setup_inode_contents(sc, 0);
+ if (error)
+ goto out_names;
+
+ sc->buf = dl;
+ sc->buf_cleanup = xchk_dirtree_buf_cleanup;
+ return 0;
+
+out_names:
+ xfblob_destroy(dl->path_names);
+out_steps:
+ xfarray_destroy(dl->path_steps);
+out_dl:
+ mutex_destroy(&dl->lock);
+ kvfree(dl);
+ return error;
+}
+
+/*
+ * Add the parent pointer described by @dl->pptr to the given path as a new
+ * step. Returns -ELNRNG if the path is too deep.
+ */
+int
+xchk_dirpath_append(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *ip,
+ struct xchk_dirpath *path,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr)
+{
+ struct xchk_dirpath_step step = {
+ .pptr_rec = *pptr, /* struct copy */
+ .name_len = name->len,
+ };
+ int error;
+
+ /*
+ * If this path is more than 2 billion steps long, this directory tree
+ * is too far gone to fix.
+ */
+ if (path->nr_steps >= XFS_MAXLINK)
+ return -ELNRNG;
+
+ error = xfblob_storename(dl->path_names, &step.name_cookie, name);
+ if (error)
+ return error;
+
+ error = xino_bitmap_set(&path->seen_inodes, ip->i_ino);
+ if (error)
+ return error;
+
+ error = xfarray_append(dl->path_steps, &step);
+ if (error)
+ return error;
+
+ path->nr_steps++;
+ return 0;
+}
+
+/*
+ * Create an xchk_path for each parent pointer of the directory that we're
+ * scanning. For each path created, we will eventually try to walk towards the
+ * root with the goal of deleting all parents except for one that leads to the
+ * root.
+ *
+ * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt
+ * parent pointer and hence there's no point in continuing; or -ENOSR if there
+ * are too many parent pointers for this directory.
+ */
+STATIC int
+xchk_dirtree_create_path(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_dirtree *dl = priv;
+ struct xchk_dirpath *path;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If there are more than 2 billion actual parent pointers for this
+ * subdirectory, this fs is too far gone to fix.
+ */
+ if (dl->nr_paths >= XFS_MAXLINK)
+ return -ENOSR;
+
+ trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec);
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XCHK_DIRPATH_SCANNING;
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Validate that the first step of this path still has a corresponding
+ * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while
+ * walking towards the roots, which is why this is necessary.
+ *
+ * This function has a side effect of loading the first parent pointer of this
+ * path into the parent pointer scratch pad. This prepares us to walk up the
+ * directory tree towards the root. Returns -ESTALE if the scan data is now
+ * out of date.
+ */
+STATIC int
+xchk_dirpath_revalidate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Look up the parent pointer that corresponds to the start of this
+ * path. If the parent pointer has disappeared on us, dump all the
+ * scan results and try again.
+ */
+ error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec,
+ &dl->pptr_args);
+ if (error == -ENOATTR) {
+ trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr,
+ path->first_step, &dl->xname, &dl->pptr_rec);
+ dl->stale = true;
+ return -ESTALE;
+ }
+
+ return error;
+}
+
+/*
+ * Walk the parent pointers of a directory at the end of a path and record
+ * the parent that we find in @dl->xname/pptr_rec.
+ */
+STATIC int
+xchk_dirpath_find_next_step(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_dirtree *dl = priv;
+ const struct xfs_parent_rec *rec = value;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ /*
+ * If we've already set @dl->pptr_rec, then this directory has multiple
+ * parents. Signal this back to the caller via -EMLINK.
+ */
+ if (dl->parents_found > 0)
+ return -EMLINK;
+
+ dl->parents_found++;
+ memcpy(dl->namebuf, name, namelen);
+ dl->xname.len = namelen;
+ dl->pptr_rec = *rec; /* struct copy */
+ return 0;
+}
+
+/* Set and log the outcome of a path walk. */
+static inline void
+xchk_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/*
+ * Scan the directory at the end of this path for its parent directory link.
+ * If we find one, extend the path. Returns -ESTALE if the scan data out of
+ * date. Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if
+ * the path got too deep.
+ */
+STATIC int
+xchk_dirpath_step_up(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ bool is_metadir)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino);
+ unsigned int lock_mode;
+ int error;
+
+ /* Grab and lock the parent directory. */
+ error = xchk_iget(sc, parent_ino, &dp);
+ if (error)
+ return error;
+
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ mutex_lock(&dl->lock);
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ /* We've reached the root directory; the path is ok. */
+ if (parent_ino == dl->root_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * The inode being scanned is its own distant ancestor! Get rid of
+ * this path.
+ */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /*
+ * We've seen this inode before during the path walk. There's a loop
+ * above us in the directory tree. This probably means that we cannot
+ * continue, but let's keep walking paths to get a full picture.
+ */
+ if (xino_bitmap_test(&path->seen_inodes, parent_ino)) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP);
+ error = 0;
+ goto out_scanlock;
+ }
+
+ /* The handle encoded in the parent pointer must match. */
+ if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) {
+ trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent pointer must point up to a directory. */
+ if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+ trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent cannot be an unlinked directory. */
+ if (VFS_I(dp)->i_nlink == 0) {
+ trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /* Parent must be in the same directory tree. */
+ if (is_metadir != xfs_is_metadir_inode(dp)) {
+ trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
+ /*
+ * If the extended attributes look as though they has been zapped by
+ * the inode record repair code, we cannot scan for parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /*
+ * Walk the parent pointers of @dp to find the parent of this directory
+ * to find the next step in our walk. If we find that @dp has exactly
+ * one parent, the parent pointer information will be stored in
+ * @dl->pptr_rec. This prepares us for the next step of the walk.
+ */
+ mutex_unlock(&dl->lock);
+ dl->parents_found = 0;
+ error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED || error == -EMLINK ||
+ (!error && dl->parents_found == 0)) {
+ /*
+ * Further up the directory tree from @sc->ip, we found a
+ * corrupt parent pointer, multiple parent pointers while
+ * finding this directory's parent, or zero parents despite
+ * having a nonzero link count. Keep looking for other paths.
+ */
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+
+ if (dl->stale) {
+ error = -ESTALE;
+ goto out_scanlock;
+ }
+
+ trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+
+ /* Append to the path steps */
+ error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec);
+ if (error)
+ goto out_scanlock;
+
+ if (path->second_step == XFARRAY_NULLIDX)
+ path->second_step = xfarray_length(dl->path_steps) - 1;
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+ xfs_iunlock(dp, lock_mode);
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Walk the directory tree upwards towards what is hopefully the root
+ * directory, recording path steps as we go. The current path components are
+ * stored in dl->pptr_rec and dl->xname.
+ *
+ * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED
+ * only if the direct parent pointer of @sc->ip associated with this path is
+ * corrupt.
+ */
+STATIC int
+xchk_dirpath_walk_upwards(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xfs_scrub *sc = dl->sc;
+ bool is_metadir;
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ /* Reload the start of this path and make sure it's still there. */
+ error = xchk_dirpath_revalidate(dl, path);
+ if (error)
+ return error;
+
+ trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname,
+ &dl->pptr_rec);
+
+ /*
+ * The inode being scanned is its own direct ancestor!
+ * Get rid of this path.
+ */
+ if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ return 0;
+ }
+
+ /*
+ * Drop ILOCK_EXCL on the inode being scanned. We still hold
+ * IOLOCK_EXCL on it, so it cannot move around or be renamed.
+ *
+ * Beyond this point we're walking up the directory tree, which means
+ * that we can acquire and drop the ILOCK on an alias of sc->ip. The
+ * ILOCK state is no longer tracked in the scrub context. Hence we
+ * must drop @sc->ip's ILOCK during the walk.
+ */
+ is_metadir = xfs_is_metadir_inode(sc->ip);
+ mutex_unlock(&dl->lock);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the first step in the walk towards the root by checking the
+ * start of this path, which is a direct parent pointer of @sc->ip.
+ * If we see any kind of error here (including corruptions), the parent
+ * pointer of @sc->ip is corrupt. Stop the whole scan.
+ */
+ error = xchk_dirpath_step_up(dl, path, is_metadir);
+ if (error) {
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ return error;
+ }
+
+ /*
+ * Take steps upward from the second step in this path towards the
+ * root. If we hit corruption errors here, there's a problem
+ * *somewhere* in the path, but we don't need to stop scanning.
+ */
+ while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
+ error = xchk_dirpath_step_up(dl, path, is_metadir);
+
+ /* Retake the locks we had, mark paths, etc. */
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+ if (error == -EFSCORRUPTED) {
+ xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+ error = 0;
+ }
+ if (!error && dl->stale)
+ return -ESTALE;
+ return error;
+}
+
+/*
+ * Decide if this path step has been touched by this live update. Returns
+ * 1 for yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_step_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ unsigned int step_nr,
+ xfarray_idx_t step_idx,
+ struct xfs_dir_update_params *p,
+ xfs_ino_t *cursor)
+{
+ struct xchk_dirpath_step step;
+ xfs_ino_t child_ino = *cursor;
+ int error;
+
+ error = xfarray_load(dl->path_steps, step_idx, &step);
+ if (error)
+ return error;
+ *cursor = be64_to_cpu(step.pptr_rec.p_ino);
+
+ /*
+ * If the parent and child being updated are not the ones mentioned in
+ * this path step, the scan data is still ok.
+ */
+ if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor)
+ return 0;
+
+ /*
+ * If the dirent name lengths or byte sequences are different, the scan
+ * data is still ok.
+ */
+ if (p->name->len != step.name_len)
+ return 0;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie,
+ &dl->hook_xname, step.name_len);
+ if (error)
+ return error;
+
+ if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0)
+ return 0;
+
+ /*
+ * If the update comes from the repair code itself, walk the state
+ * machine forward.
+ */
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_ADOPTING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED);
+ return 0;
+ }
+
+ if (p->ip->i_ino == dl->scan_ino &&
+ path->outcome == XREP_DIRPATH_DELETING) {
+ xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED);
+ return 0;
+ }
+
+ /* Exact match, scan data is out of date. */
+ trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp,
+ p->ip, p->name);
+ return 1;
+}
+
+/*
+ * Decide if this path has been touched by this live update. Returns 1 for
+ * yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_is_stale(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xfs_dir_update_params *p)
+{
+ xfs_ino_t cursor = dl->scan_ino;
+ xfarray_idx_t idx = path->first_step;
+ unsigned int i;
+ int ret;
+
+ /*
+ * The child being updated has not been seen by this path at all; this
+ * path cannot be stale.
+ */
+ if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino))
+ return 0;
+
+ ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+
+ for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) {
+ ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor);
+ if (ret != 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Decide if a directory update from the regular filesystem touches any of the
+ * paths we've scanned, and invalidate the scan data if true.
+ */
+STATIC int
+xchk_dirtree_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_dirtree *dl;
+ struct xchk_dirpath *path;
+ int ret;
+
+ dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb);
+
+ trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta,
+ p->name);
+
+ mutex_lock(&dl->lock);
+
+ if (dl->stale || dl->aborted)
+ goto out_unlock;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ ret = xchk_dirpath_is_stale(dl, path, p);
+ if (ret < 0) {
+ dl->aborted = true;
+ break;
+ }
+ if (ret == 1) {
+ dl->stale = true;
+ break;
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&dl->lock);
+ return NOTIFY_DONE;
+}
+
+/* Delete all the collected path information. */
+STATIC void
+xchk_dirtree_reset(
+ void *buf)
+{
+ struct xchk_dirtree *dl = buf;
+ struct xchk_dirpath *path, *n;
+
+ ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ xchk_dirtree_for_each_path_safe(dl, path, n) {
+ list_del_init(&path->list);
+ xino_bitmap_destroy(&path->seen_inodes);
+ kfree(path);
+ }
+ dl->nr_paths = 0;
+
+ xfarray_truncate(dl->path_steps);
+ xfblob_truncate(dl->path_names);
+
+ dl->stale = false;
+}
+
+/*
+ * Load the name/pptr from the first step in this path into @dl->pptr_rec and
+ * @dl->xname.
+ */
+STATIC int
+xchk_dirtree_load_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname,
+ step.name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step.pptr_rec; /* struct copy */
+ return 0;
+}
+
+/*
+ * For each parent pointer of this subdir, trace a path upwards towards the
+ * root directory and record what we find. Returns 0 for success;
+ * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed, -ELNRNG if a
+ * path was too deep; -ENOSR if there were too many parent pointers; or
+ * a negative errno.
+ */
+int
+xchk_dirtree_find_paths_to_root(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xchk_dirtree_reset(dl);
+
+ /*
+ * If the extended attributes look as though they has been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(sc->ip)) {
+ xchk_set_incomplete(sc);
+ return -EBUSY;
+ }
+
+ /*
+ * Create path walk contexts for each parent of the directory
+ * that is being scanned. Directories are supposed to have
+ * only one parent, but this is how we detect multiple parents.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path,
+ NULL, dl);
+ if (error)
+ return error;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ /* Load path components into dl->pptr/xname */
+ error = xchk_dirtree_load_path(dl, path);
+ if (error)
+ return error;
+
+ /*
+ * Try to walk up each path to the root. This enables
+ * us to find directory loops in ancestors, and the
+ * like.
+ */
+ error = xchk_dirpath_walk_upwards(dl, path);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * A parent pointer of @sc->ip is bad, don't
+ * bother continuing.
+ */
+ break;
+ }
+ if (error == -ESTALE) {
+ /* This had better be an invalidation. */
+ ASSERT(dl->stale);
+ break;
+ }
+ if (error)
+ return error;
+ if (dl->aborted)
+ return 0;
+ }
+ } while (dl->stale);
+
+ return error;
+}
+
+/*
+ * Figure out what to do with the paths we tried to find. Do not call this
+ * if the scan results are stale.
+ */
+void
+xchk_dirtree_evaluate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ ASSERT(!dl->stale);
+
+ /* Scan the paths we have to decide what to do. */
+ memset(oc, 0, sizeof(struct xchk_dirtree_outcomes));
+ xchk_dirtree_for_each_path(dl, path) {
+ trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr,
+ path->nr_steps, path->outcome);
+
+ switch (path->outcome) {
+ case XCHK_DIRPATH_SCANNING:
+ /* shouldn't get here */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_DELETE:
+ /* This one is already going away. */
+ oc->bad++;
+ break;
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ /* Couldn't find the end of this path. */
+ oc->suspect++;
+ break;
+ case XCHK_DIRPATH_STALE:
+ /* shouldn't get here either */
+ ASSERT(0);
+ break;
+ case XCHK_DIRPATH_OK:
+ /* This path got all the way to the root. */
+ oc->good++;
+ break;
+ case XREP_DIRPATH_DELETING:
+ case XREP_DIRPATH_DELETED:
+ case XREP_DIRPATH_ADOPTING:
+ case XREP_DIRPATH_ADOPTED:
+ /* These should not be in progress! */
+ ASSERT(0);
+ break;
+ }
+ }
+
+ trace_xchk_dirtree_evaluate(dl, oc);
+}
+
+/* Look for directory loops. */
+int
+xchk_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree_outcomes oc;
+ struct xchk_dirtree *dl = sc->buf;
+ int error;
+
+ /*
+ * Nondirectories do not point downwards to other files, so they cannot
+ * cause a cycle in the directory tree.
+ */
+ if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return -ENOENT;
+
+ ASSERT(xfs_has_parent(sc->mp));
+
+ /*
+ * Find the root of the directory tree. Remember which directory to
+ * scan, because the hook doesn't detach until after sc->ip gets
+ * released during teardown.
+ */
+ dl->root_ino = xchk_inode_rootdir_inum(sc->ip);
+ dl->scan_ino = sc->ip->i_ino;
+
+ trace_xchk_dirtree_start(sc->ip, sc->sm, 0);
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * paths that we have already scanned. The scanner thread takes each
+ * directory's ILOCK, which means that any in-progress directory update
+ * will finish before we can scan the directory.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update);
+ error = xfs_dir_hook_add(sc->mp, &dl->dhook);
+ if (error)
+ goto out;
+
+ mutex_lock(&dl->lock);
+
+ /* Trace each parent pointer's path to the root. */
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) {
+ /*
+ * Don't bother walking the paths if the xattr structure or the
+ * parent pointers are corrupt; this scan cannot be completed
+ * without full information.
+ */
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error == -EBUSY) {
+ /*
+ * We couldn't scan some directory's parent pointers because
+ * the attr fork looked like it had been zapped. The
+ * scan was marked incomplete, so no further error code
+ * is necessary.
+ */
+ error = 0;
+ goto out_scanlock;
+ }
+ if (error)
+ goto out_scanlock;
+ if (dl->aborted) {
+ xchk_set_incomplete(sc);
+ goto out_scanlock;
+ }
+
+ /* Assess what we found in our path evaluation. */
+ xchk_dirtree_evaluate(dl, &oc);
+ if (xchk_dirtree_parentless(dl)) {
+ if (oc.good || oc.bad || oc.suspect)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (oc.bad || oc.good + oc.suspect != 1)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ if (oc.suspect)
+ xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+out_scanlock:
+ mutex_unlock(&dl->lock);
+out:
+ trace_xchk_dirtree_done(sc->ip, sc->sm, error);
+ return error;
+}
+
+/* Does the directory targetted by this scrub have no parents? */
+bool
+xchk_dirtree_parentless(const struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+
+ if (xchk_inode_is_dirtree_root(sc->ip))
+ return true;
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return true;
+ return false;
+}
diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h
new file mode 100644
index 000000000000..9e5d95492717
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_DIRTREE_H__
+#define __XFS_SCRUB_DIRTREE_H__
+
+/*
+ * Each of these represents one parent pointer path step in a chain going
+ * up towards the directory tree root. These are stored inside an xfarray.
+ */
+struct xchk_dirpath_step {
+ /* Directory entry name associated with this parent link. */
+ xfblob_cookie name_cookie;
+ unsigned int name_len;
+
+ /* Handle of the parent directory. */
+ struct xfs_parent_rec pptr_rec;
+};
+
+enum xchk_dirpath_outcome {
+ XCHK_DIRPATH_SCANNING = 0, /* still being put together */
+ XCHK_DIRPATH_DELETE, /* delete this path */
+ XCHK_DIRPATH_CORRUPT, /* corruption detected in path */
+ XCHK_DIRPATH_LOOP, /* cycle detected further up */
+ XCHK_DIRPATH_STALE, /* path is stale */
+ XCHK_DIRPATH_OK, /* path reaches the root */
+
+ XREP_DIRPATH_DELETING, /* path is being deleted */
+ XREP_DIRPATH_DELETED, /* path has been deleted */
+ XREP_DIRPATH_ADOPTING, /* path is being adopted */
+ XREP_DIRPATH_ADOPTED, /* path has been adopted */
+};
+
+/*
+ * Each of these represents one parent pointer path out of the directory being
+ * scanned. These exist in-core, and hopefully there aren't more than a
+ * handful of them.
+ */
+struct xchk_dirpath {
+ struct list_head list;
+
+ /* Index of the first step in this path. */
+ xfarray_idx_t first_step;
+
+ /* Index of the second step in this path. */
+ xfarray_idx_t second_step;
+
+ /* Inodes seen while walking this path. */
+ struct xino_bitmap seen_inodes;
+
+ /* Number of steps in this path. */
+ unsigned int nr_steps;
+
+ /* Which path is this? */
+ unsigned int path_nr;
+
+ /* What did we conclude from following this path? */
+ enum xchk_dirpath_outcome outcome;
+};
+
+struct xchk_dirtree_outcomes {
+ /* Number of XCHK_DIRPATH_DELETE */
+ unsigned int bad;
+
+ /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */
+ unsigned int suspect;
+
+ /* Number of XCHK_DIRPATH_OK */
+ unsigned int good;
+
+ /* Directory needs to be added to lost+found */
+ bool needs_adoption;
+};
+
+struct xchk_dirtree {
+ struct xfs_scrub *sc;
+
+ /* Root inode that we're looking for. */
+ xfs_ino_t root_ino;
+
+ /*
+ * This is the inode that we're scanning. The live update hook can
+ * continue to be called after xchk_teardown drops sc->ip but before
+ * it calls buf_cleanup, so we keep a copy.
+ */
+ xfs_ino_t scan_ino;
+
+ /*
+ * If we start deleting redundant paths to this subdirectory, this is
+ * the inode number of the surviving parent and the dotdot entry will
+ * be set to this value. If the value is NULLFSINO, then use @root_ino
+ * as a stand-in until the orphanage can adopt the subdirectory.
+ */
+ xfs_ino_t parent_ino;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_parent_rec pptr_rec;
+ struct xfs_da_args pptr_args;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+
+ /* Information for reparenting this directory. */
+ struct xrep_adoption adoption;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+
+ /* Parent pointer update arguments. */
+ struct xfs_parent_args ppargs;
+
+ /* lock for everything below here */
+ struct mutex lock;
+
+ /* buffer for the live update functions to use for dirent names */
+ struct xfs_name hook_xname;
+ unsigned char hook_namebuf[MAXNAMELEN];
+
+ /*
+ * All path steps observed during this scan. Each of the path
+ * steps for a particular pathwalk are recorded in sequential
+ * order in the xfarray. A pathwalk ends either with a step
+ * pointing to the root directory (success) or pointing to NULLFSINO
+ * (loop detected, empty dir detected, etc).
+ */
+ struct xfarray *path_steps;
+
+ /* All names observed during this scan. */
+ struct xfblob *path_names;
+
+ /* All paths being tracked by this scanner. */
+ struct list_head path_list;
+
+ /* Number of paths in path_list. */
+ unsigned int nr_paths;
+
+ /* Number of parents found by a pptr scan. */
+ unsigned int parents_found;
+
+ /* Have the path data been invalidated by a concurrent update? */
+ bool stale:1;
+
+ /* Has the scan been aborted? */
+ bool aborted:1;
+};
+
+#define xchk_dirtree_for_each_path_safe(dl, path, n) \
+ list_for_each_entry_safe((path), (n), &(dl)->path_list, list)
+
+#define xchk_dirtree_for_each_path(dl, path) \
+ list_for_each_entry((path), &(dl)->path_list, list)
+
+bool xchk_dirtree_parentless(const struct xchk_dirtree *dl);
+
+int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl);
+int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip,
+ struct xchk_dirpath *path, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr);
+void xchk_dirtree_evaluate(struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc);
+
+#endif /* __XFS_SCRUB_DIRTREE_H__ */
diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c
new file mode 100644
index 000000000000..5c04e70ba951
--- /dev/null
+++ b/fs/xfs/scrub/dirtree_repair.c
@@ -0,0 +1,821 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+#include "scrub/readdir.h"
+
+/*
+ * Directory Tree Structure Repairs
+ * ================================
+ *
+ * If we decide that the directory being scanned is participating in a
+ * directory loop, the only change we can make is to remove directory entries
+ * pointing down to @sc->ip. If that leaves it with no parents, the directory
+ * should be adopted by the orphanage.
+ */
+
+/* Set up to repair directory loops. */
+int
+xrep_setup_dirtree(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/* Change the outcome of this path. */
+static inline void
+xrep_dirpath_set_outcome(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ enum xchk_dirpath_outcome outcome)
+{
+ trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+ outcome);
+
+ path->outcome = outcome;
+}
+
+/* Delete all paths. */
+STATIC void
+xrep_dirtree_delete_all_paths(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good == 0);
+}
+
+/* Since this is the surviving path, set the dotdot entry to this value. */
+STATIC void
+xrep_dirpath_retain_parent(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, &step);
+ if (error)
+ return;
+
+ dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino);
+}
+
+/* Find the one surviving path so we know how to set dotdot. */
+STATIC void
+xrep_dirtree_find_surviving_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ ASSERT(foundit == false);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect + oc->good == 1);
+}
+
+/* Delete all paths except for the one good one. */
+STATIC void
+xrep_dirtree_keep_one_good_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->good--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 0);
+ ASSERT(oc->good < 2);
+}
+
+/* Delete all paths except for one suspect one. */
+STATIC void
+xrep_dirtree_keep_one_suspect_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ bool foundit = false;
+
+ xchk_dirtree_for_each_path(dl, path) {
+ switch (path->outcome) {
+ case XCHK_DIRPATH_CORRUPT:
+ case XCHK_DIRPATH_LOOP:
+ if (!foundit) {
+ xrep_dirpath_retain_parent(dl, path);
+ foundit = true;
+ continue;
+ }
+ oc->suspect--;
+ oc->bad++;
+ xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+ break;
+ case XCHK_DIRPATH_OK:
+ ASSERT(0);
+ break;
+ default:
+ break;
+ }
+ }
+
+ ASSERT(oc->suspect == 1);
+ ASSERT(oc->good == 0);
+}
+
+/*
+ * Figure out what to do with the paths we tried to find. Returns -EDEADLOCK
+ * if the scan results have become stale.
+ */
+STATIC void
+xrep_dirtree_decide_fate(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ xchk_dirtree_evaluate(dl, oc);
+
+ /* Parentless directories should not have any paths at all. */
+ if (xchk_dirtree_parentless(dl)) {
+ xrep_dirtree_delete_all_paths(dl, oc);
+ return;
+ }
+
+ /* One path is exactly the number of paths we want. */
+ if (oc->good + oc->suspect == 1) {
+ xrep_dirtree_find_surviving_path(dl, oc);
+ return;
+ }
+
+ /* Zero paths means we should reattach the subdir to the orphanage. */
+ if (oc->good + oc->suspect == 0) {
+ if (dl->sc->orphanage)
+ oc->needs_adoption = true;
+ return;
+ }
+
+ /*
+ * Otherwise, this subdirectory has too many parents. If there's at
+ * least one good path, keep it and delete the others.
+ */
+ if (oc->good > 0) {
+ xrep_dirtree_keep_one_good_path(dl, oc);
+ return;
+ }
+
+ /*
+ * There are no good paths and there are too many suspect paths.
+ * Keep the first suspect path and delete the rest.
+ */
+ xrep_dirtree_keep_one_suspect_path(dl, oc);
+}
+
+/*
+ * Load the first step of this path into @step and @dl->xname/pptr
+ * for later repair work.
+ */
+STATIC int
+xrep_dirtree_prep_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ int error;
+
+ error = xfarray_load(dl->path_steps, path->first_step, step);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname,
+ step->name_len);
+ if (error)
+ return error;
+
+ dl->pptr_rec = step->pptr_rec; /* struct copy */
+ return 0;
+}
+
+/* Delete the VFS dentry for a removed child. */
+STATIC int
+xrep_dirtree_purge_dentry(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ const struct xfs_name *name)
+{
+ struct qstr qname = QSTR_INIT(name->name, name->len);
+ struct dentry *parent_dentry, *child_dentry;
+ int error = 0;
+
+ /*
+ * Find the dentry for the parent directory. If there isn't one, we're
+ * done. Caller already holds i_rwsem for parent and child.
+ */
+ parent_dentry = d_find_alias(VFS_I(dp));
+ if (!parent_dentry)
+ return 0;
+
+ /* The VFS thinks the parent is a directory, right? */
+ if (!d_is_dir(parent_dentry)) {
+ ASSERT(d_is_dir(parent_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_parent;
+ }
+
+ /*
+ * Try to find the dirent pointing to the child. If there isn't one,
+ * we're done.
+ */
+ qname.hash = full_name_hash(parent_dentry, name->name, name->len);
+ child_dentry = d_lookup(parent_dentry, &qname);
+ if (!child_dentry) {
+ error = 0;
+ goto out_dput_parent;
+ }
+
+ trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry);
+
+ /* Child is not a directory? We're screwed. */
+ if (!d_is_dir(child_dentry)) {
+ ASSERT(d_is_dir(child_dentry));
+ error = -EFSCORRUPTED;
+ goto out_dput_child;
+ }
+
+ /* Replace the child dentry with a negative one. */
+ d_delete(child_dentry);
+
+out_dput_child:
+ dput(child_dentry);
+out_dput_parent:
+ dput(parent_dentry);
+ return error;
+}
+
+/*
+ * Prepare to delete a link by taking the IOLOCK of the parent and the child
+ * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we
+ * took both locks, or a negative errno if we couldn't lock the parent in time.
+ */
+static inline int
+xrep_dirtree_unlink_iolock(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ do {
+ xfs_ilock(dp, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Remove a link from the directory tree and update the dcache. Returns
+ * -ESTALE if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_unlink(
+ struct xchk_dirtree *dl,
+ struct xfs_inode *dp,
+ struct xchk_dirpath *path,
+ struct xchk_dirpath_step *step)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t dotdot_ino;
+ xfs_ino_t parent_ino = dl->parent_ino;
+ unsigned int resblks;
+ int dontcare;
+ int error;
+
+ /* Take IOLOCK_EXCL of the parent and child. */
+ error = xrep_dirtree_unlink_iolock(sc, dp);
+ if (error)
+ return error;
+
+ /*
+ * Create the transaction that we need to sever the path. Ignore
+ * EDQUOT and ENOSPC being returned via nospace_error because the
+ * directory code can handle a reservationless update.
+ */
+ resblks = xfs_remove_space_res(mp, step->name_len);
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip,
+ &resblks, &sc->tp, &dontcare);
+ if (error)
+ goto out_iolock;
+
+ /*
+ * Cancel if someone invalidate the paths while we were trying to get
+ * the ILOCK.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans_cancel;
+ }
+ xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING);
+ mutex_unlock(&dl->lock);
+
+ trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr,
+ &dl->xname, &dl->pptr_rec);
+
+ /*
+ * Decide if we need to reset the dotdot entry. Rules:
+ *
+ * - If there's a surviving parent, we want dotdot to point there.
+ * - If we don't have any surviving parents, then point dotdot at the
+ * root dir.
+ * - If dotdot is already set to the value we want, pass in NULLFSINO
+ * for no change necessary.
+ *
+ * Do this /before/ we dirty anything, in case the dotdot lookup
+ * fails.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino);
+ if (error)
+ goto out_trans_cancel;
+ if (parent_ino == NULLFSINO)
+ parent_ino = dl->root_ino;
+ if (dotdot_ino == parent_ino)
+ parent_ino = NULLFSINO;
+
+ /* Drop the link from sc->ip's dotdot entry. */
+ error = xfs_droplink(sc->tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Reset the dotdot entry to a surviving parent. */
+ if (parent_ino != NULLFSINO) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ parent_ino, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /* Drop the link from dp to sc->ip. */
+ error = xfs_droplink(sc->tp, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino,
+ resblks);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ goto out_trans_cancel;
+ }
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_removename(sc->tp, &dl->ppargs, dp,
+ &dl->xname, sc->ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Notify dirent hooks that we removed the bad link, invalidate the
+ * dcache, and commit the repair.
+ */
+ xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname);
+ error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans_cancel:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+out_iolock:
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Delete a directory entry that points to this directory. Returns -ESTALE
+ * if the scan data are now out of date.
+ */
+STATIC int
+xrep_dirtree_delete_path(
+ struct xchk_dirtree *dl,
+ struct xchk_dirpath *path)
+{
+ struct xchk_dirpath_step step;
+ struct xfs_scrub *sc = dl->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /*
+ * Load the parent pointer and directory inode for this path, then
+ * drop the scan lock, the ILOCK, and the transaction so that
+ * _delete_path can reserve the proper transaction. This sets up
+ * @dl->xname for the deletion.
+ */
+ error = xrep_dirtree_prep_path(dl, path, &step);
+ if (error)
+ return error;
+
+ error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp);
+ if (error)
+ return error;
+
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Delete the directory link and release the parent. */
+ error = xrep_dirtree_unlink(dl, dp, path, &step);
+ xchk_irele(sc, dp);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/* Add a new path to represent our in-progress adoption. */
+STATIC int
+xrep_dirtree_create_adoption_path(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ struct xchk_dirpath *path;
+ int error;
+
+ /*
+ * We should have capped the number of paths at XFS_MAXLINK-1 in the
+ * scanner.
+ */
+ if (dl->nr_paths > XFS_MAXLINK) {
+ ASSERT(dl->nr_paths <= XFS_MAXLINK);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Create a new xchk_path structure to remember this parent pointer
+ * and record the first name step.
+ */
+ path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+ if (!path)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&path->list);
+ xino_bitmap_init(&path->seen_inodes);
+ path->nr_steps = 0;
+ path->outcome = XREP_DIRPATH_ADOPTING;
+
+ /*
+ * Record the new link that we just created in the orphanage. Because
+ * adoption is the last repair that we perform, we don't bother filling
+ * in the path all the way back to the root.
+ */
+ xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage);
+
+ error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino);
+ if (error)
+ goto out_path;
+
+ trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths,
+ &dl->xname, &dl->pptr_rec);
+
+ error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname,
+ &dl->pptr_rec);
+ if (error)
+ goto out_path;
+
+ path->first_step = xfarray_length(dl->path_steps) - 1;
+ path->second_step = XFARRAY_NULLIDX;
+ path->path_nr = dl->nr_paths;
+
+ list_add_tail(&path->list, &dl->path_list);
+ dl->nr_paths++;
+ return 0;
+
+out_path:
+ kfree(path);
+ return error;
+}
+
+/*
+ * Prepare to move a file to the orphanage by taking the IOLOCK of the
+ * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on
+ * @sc->ip. Returns 0 if we took both locks, or a negative errno if we
+ * couldn't lock the orphanage in time.
+ */
+static inline int
+xrep_dirtree_adopt_iolock(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
+
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ do {
+ xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error)) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ delay(1);
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Reattach this orphaned directory to the orphanage. Do not call this with
+ * any resources held. Returns -ESTALE if the scan data have become out of
+ * date.
+ */
+STATIC int
+xrep_dirtree_adopt(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /* Take the IOLOCK of the orphanage and the scrub target. */
+ error = xrep_dirtree_adopt_iolock(sc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for an adoption. The directory tree fixer runs after the
+ * link counts have been corrected. Therefore, we must bump the
+ * child's link count since there will be no further opportunity to fix
+ * errors.
+ */
+ error = xrep_adoption_trans_alloc(sc, &dl->adoption);
+ if (error)
+ goto out_iolock;
+ dl->adoption.bump_child_nlink = true;
+
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&dl->adoption, &dl->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Now that we have a proposed name for the orphanage entry, create
+ * a faux path so that the live update hook will see it.
+ */
+ mutex_lock(&dl->lock);
+ if (dl->stale) {
+ mutex_unlock(&dl->lock);
+ error = -ESTALE;
+ goto out_trans;
+ }
+ error = xrep_dirtree_create_adoption_path(dl);
+ mutex_unlock(&dl->lock);
+ if (error)
+ goto out_trans;
+
+ /* Reparent the directory. */
+ error = xrep_adoption_move(&dl->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Commit the name and release all inode locks except for the scrub
+ * target's IOLOCK.
+ */
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_trans:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+out_iolock:
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * This newly orphaned directory needs to be adopted by the orphanage.
+ * Make this happen.
+ */
+STATIC int
+xrep_dirtree_move_to_orphanage(
+ struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+ int error;
+
+ /*
+ * Start by dropping all the resources that we hold so that we can grab
+ * all the resources that we need for the adoption.
+ */
+ mutex_unlock(&dl->lock);
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Perform the adoption. */
+ error = xrep_dirtree_adopt(dl);
+
+ /*
+ * Retake all the resources we had at the beginning even if the repair
+ * failed or the scan data are now stale. This keeps things simple for
+ * the caller.
+ */
+ xchk_trans_alloc_empty(sc);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ mutex_lock(&dl->lock);
+
+ if (!error && dl->stale)
+ error = -ESTALE;
+ return error;
+}
+
+/*
+ * Try to fix all the problems. Returns -ESTALE if the scan data have become
+ * out of date.
+ */
+STATIC int
+xrep_dirtree_fix_problems(
+ struct xchk_dirtree *dl,
+ struct xchk_dirtree_outcomes *oc)
+{
+ struct xchk_dirpath *path;
+ int error;
+
+ /* Delete all the paths we don't want. */
+ xchk_dirtree_for_each_path(dl, path) {
+ if (path->outcome != XCHK_DIRPATH_DELETE)
+ continue;
+
+ error = xrep_dirtree_delete_path(dl, path);
+ if (error)
+ return error;
+ }
+
+ /* Reparent this directory to the orphanage. */
+ if (oc->needs_adoption) {
+ if (xrep_orphanage_can_adopt(dl->sc))
+ return xrep_dirtree_move_to_orphanage(dl);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Fix directory loops involving this directory. */
+int
+xrep_dirtree(
+ struct xfs_scrub *sc)
+{
+ struct xchk_dirtree *dl = sc->buf;
+ struct xchk_dirtree_outcomes oc;
+ int error;
+
+ /*
+ * Prepare to fix the directory tree by retaking the scan lock. The
+ * order of resource acquisition is still IOLOCK -> transaction ->
+ * ILOCK -> scan lock.
+ */
+ mutex_lock(&dl->lock);
+ do {
+ /*
+ * Decide what we're going to do, then do it. An -ESTALE
+ * return here means the scan results are invalid and we have
+ * to walk again.
+ */
+ if (!dl->stale) {
+ xrep_dirtree_decide_fate(dl, &oc);
+
+ trace_xrep_dirtree_decided_fate(dl, &oc);
+
+ error = xrep_dirtree_fix_problems(dl, &oc);
+ if (!error || error != -ESTALE)
+ break;
+ }
+ error = xchk_dirtree_find_paths_to_root(dl);
+ if (error == -ELNRNG || error == -ENOSR)
+ error = -EFSCORRUPTED;
+ } while (!error);
+ mutex_unlock(&dl->lock);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
new file mode 100644
index 000000000000..84487072b6dd
--- /dev/null
+++ b/fs/xfs/scrub/findparent.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Finding the Parent of a Directory
+ * =================================
+ *
+ * Directories have parent pointers, in the sense that each directory contains
+ * a dotdot entry that points to the single allowed parent. The brute force
+ * way to find the parent of a given directory is to scan every directory in
+ * the filesystem looking for a child dirent that references this directory.
+ *
+ * This module wraps the process of scanning the directory tree. It requires
+ * that @sc->ip is the directory whose parent we want to find, and that the
+ * caller hold only the IOLOCK on that directory. The scan itself needs to
+ * take the ILOCK of each directory visited.
+ *
+ * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is
+ * necessary to use dirent hook to update the parent scan results. Callers
+ * must not read the scan results without re-taking @sc->ip's ILOCK.
+ *
+ * There are a few shortcuts that we can take to avoid scanning the entire
+ * filesystem, such as noticing directory tree roots and querying the dentry
+ * cache for parent information.
+ */
+
+struct xrep_findparent_info {
+ /* The directory currently being scanned. */
+ struct xfs_inode *dp;
+
+ /*
+ * Scrub context. We're looking for a @dp containing a directory
+ * entry pointing to sc->ip->i_ino.
+ */
+ struct xfs_scrub *sc;
+
+ /* Optional scan information for a xrep_findparent_scan call. */
+ struct xrep_parent_scan_info *parent_scan;
+
+ /*
+ * Parent that we've found for sc->ip. If we're scanning the entire
+ * directory tree, we need this to ensure that we only find /one/
+ * parent directory.
+ */
+ xfs_ino_t found_parent;
+
+ /*
+ * This is set to true if @found_parent was not observed directly from
+ * the directory scan but by noticing a change in dotdot entries after
+ * cycling the sc->ip IOLOCK.
+ */
+ bool parent_tentative;
+};
+
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_findparent_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_findparent_info *fpi = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(fpi->sc, &error))
+ return error;
+
+ if (ino != fpi->sc->ip->i_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /*
+ * Ignore dotdot and dot entries -- we're looking for parent -> child
+ * links only.
+ */
+ if (name->name[0] == '.' && (name->len == 1 ||
+ (name->len == 2 && name->name[1] == '.')))
+ return 0;
+
+ /* Uhoh, more than one parent for a dir? */
+ if (fpi->found_parent != NULLFSINO &&
+ !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) {
+ trace_xrep_findparent_dirent(fpi->sc->ip, 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember this. */
+ trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino);
+ fpi->found_parent = fpi->dp->i_ino;
+ fpi->parent_tentative = false;
+
+ if (fpi->parent_scan)
+ xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino);
+
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_findparent_walk_directory(
+ struct xrep_findparent_info *fpi)
+{
+ struct xfs_scrub *sc = fpi->sc;
+ struct xfs_inode *dp = fpi->dp;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * The inode being scanned cannot be its own parent, nor can any
+ * temporary directory we created to stage this repair.
+ */
+ if (dp == sc->ip || dp == sc->tempip)
+ return 0;
+
+ /*
+ * Similarly, temporary files created to stage a repair cannot be the
+ * parent of this inode.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
+ /*
+ * Scan the directory to see if there it contains an entry pointing to
+ * the directory that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /* Don't mix metadata and regular directory trees. */
+ if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip))
+ goto out_unlock;
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Update this directory's dotdot pointer based on ongoing dirent updates.
+ */
+STATIC int
+xrep_findparent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent_scan_info *pscan;
+ struct xfs_scrub *sc;
+
+ pscan = container_of(nb, struct xrep_parent_scan_info,
+ dhook.dirent_hook.nb);
+ sc = pscan->sc;
+
+ /*
+ * If @p->ip is the subdirectory that we're interested in and we've
+ * already scanned @p->dp, update the dotdot target inumber to the
+ * parent inode.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) {
+ if (p->delta > 0) {
+ xrep_findparent_scan_found(pscan, p->dp->i_ino);
+ } else {
+ xrep_findparent_scan_found(pscan, NULLFSINO);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Set up a scan to find the parent of a directory. The provided dirent hook
+ * will be called when there is a dotdot update for the inode being repaired.
+ */
+int
+__xrep_findparent_scan_start(
+ struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn)
+{
+ int error;
+
+ if (!(sc->flags & XCHK_FSGATES_DIRENTS)) {
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ return -EINVAL;
+ }
+
+ pscan->sc = sc;
+ pscan->parent_ino = NULLFSINO;
+
+ mutex_init(&pscan->lock);
+
+ xchk_iscan_start(sc, 30000, 100, &pscan->iscan);
+
+ /*
+ * Hook into the dirent update code. The hook only operates on inodes
+ * that were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ */
+ if (custom_fn)
+ xfs_dir_hook_setup(&pscan->dhook, custom_fn);
+ else
+ xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update);
+ error = xfs_dir_hook_add(sc->mp, &pscan->dhook);
+ if (error)
+ goto out_iscan;
+
+ return 0;
+out_iscan:
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+ return error;
+}
+
+/*
+ * Scan the entire filesystem looking for a parent inode for the inode being
+ * scrubbed. @sc->ip must not be the root of a directory tree. Callers must
+ * not hold a dirty transaction or any lock that would interfere with taking
+ * an ILOCK.
+ *
+ * Returns 0 with @pscan->parent_ino set to the parent that we found.
+ * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_scan(
+ struct xrep_parent_scan_info *pscan)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = pscan->sc,
+ .found_parent = NULLFSINO,
+ .parent_scan = pscan,
+ };
+ struct xfs_scrub *sc = pscan->sc;
+ int ret;
+
+ ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode));
+
+ while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) {
+ if (S_ISDIR(VFS_I(fpi.dp)->i_mode))
+ ret = xrep_findparent_walk_directory(&fpi);
+ else
+ ret = 0;
+ xchk_iscan_mark_visited(&pscan->iscan, fpi.dp);
+ xchk_irele(sc, fpi.dp);
+ if (ret)
+ break;
+
+ if (xchk_should_terminate(sc, &ret))
+ break;
+ }
+ xchk_iscan_iter_finish(&pscan->iscan);
+
+ return ret;
+}
+
+/* Tear down a parent scan. */
+void
+xrep_findparent_scan_teardown(
+ struct xrep_parent_scan_info *pscan)
+{
+ xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook);
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+}
+
+/* Finish a parent scan early. */
+void
+xrep_findparent_scan_finish_early(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ xrep_findparent_scan_found(pscan, ino);
+ xchk_iscan_finish_early(&pscan->iscan);
+}
+
+/*
+ * Confirm that the directory @parent_ino actually contains a directory entry
+ * pointing to the child @sc->ip->ino. This function returns one of several
+ * ways:
+ *
+ * Returns 0 with @parent_ino unchanged if the parent was confirmed.
+ * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_confirm(
+ struct xfs_scrub *sc,
+ xfs_ino_t *parent_ino)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = sc,
+ .found_parent = NULLFSINO,
+ };
+ int error;
+
+ /* The root directory always points to itself. */
+ if (sc->ip == sc->mp->m_rootip) {
+ *parent_ino = sc->mp->m_sb.sb_rootino;
+ return 0;
+ }
+
+ /* The metadata root directory always points to itself. */
+ if (sc->ip == sc->mp->m_metadirip) {
+ *parent_ino = sc->mp->m_sb.sb_metadirino;
+ return 0;
+ }
+
+ /* Unlinked dirs can point anywhere; point them up to the root dir. */
+ if (VFS_I(sc->ip)->i_nlink == 0) {
+ *parent_ino = xchk_inode_rootdir_inum(sc->ip);
+ return 0;
+ }
+
+ /* Reject garbage parent inode numbers and self-referential parents. */
+ if (*parent_ino == NULLFSINO)
+ return 0;
+ if (!xfs_verify_dir_ino(sc->mp, *parent_ino) ||
+ *parent_ino == sc->ip->i_ino) {
+ *parent_ino = NULLFSINO;
+ return 0;
+ }
+
+ error = xchk_iget(sc, *parent_ino, &fpi.dp);
+ if (error)
+ return error;
+
+ if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) {
+ *parent_ino = NULLFSINO;
+ goto out_rele;
+ }
+
+ error = xrep_findparent_walk_directory(&fpi);
+ if (error)
+ goto out_rele;
+
+ *parent_ino = fpi.found_parent;
+out_rele:
+ xchk_irele(sc, fpi.dp);
+ return error;
+}
+
+/*
+ * If we're the root of a directory tree, we are our own parent. If we're an
+ * unlinked directory, the parent /won't/ have a link to us. Set the parent
+ * directory to the root for both cases. Returns NULLFSINO if we don't know
+ * what to do.
+ */
+xfs_ino_t
+xrep_findparent_self_reference(
+ struct xfs_scrub *sc)
+{
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
+ return sc->mp->m_sb.sb_rootino;
+
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino)
+ return sc->mp->m_sb.sb_metadirino;
+
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return xchk_inode_rootdir_inum(sc->ip);
+
+ return NULLFSINO;
+}
+
+/* Check the dentry cache to see if knows of a parent for the scrub target. */
+xfs_ino_t
+xrep_findparent_from_dcache(
+ struct xfs_scrub *sc)
+{
+ struct inode *pip = NULL;
+ struct dentry *dentry, *parent;
+ xfs_ino_t ret = NULLFSINO;
+
+ dentry = d_find_alias(VFS_I(sc->ip));
+ if (!dentry)
+ goto out;
+
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
+
+ ASSERT(parent->d_sb == sc->ip->i_mount->m_super);
+
+ pip = igrab(d_inode(parent));
+ dput(parent);
+
+ if (S_ISDIR(pip->i_mode)) {
+ trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino);
+ ret = XFS_I(pip)->i_ino;
+ }
+
+ xchk_irele(sc, XFS_I(pip));
+
+out_dput:
+ dput(dentry);
+out:
+ return ret;
+}
diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h
new file mode 100644
index 000000000000..d998c7a88152
--- /dev/null
+++ b/fs/xfs/scrub/findparent.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FINDPARENT_H__
+#define __XFS_SCRUB_FINDPARENT_H__
+
+struct xrep_parent_scan_info {
+ struct xfs_scrub *sc;
+
+ /* Inode scan cursor. */
+ struct xchk_iscan iscan;
+
+ /* Hook to capture directory entry updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Lock protecting parent_ino. */
+ struct mutex lock;
+
+ /* Parent inode that we've found. */
+ xfs_ino_t parent_ino;
+
+ bool lookup_parent;
+};
+
+int __xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan,
+ notifier_fn_t custom_fn);
+static inline int xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan)
+{
+ return __xrep_findparent_scan_start(sc, pscan, NULL);
+}
+int xrep_findparent_scan(struct xrep_parent_scan_info *pscan);
+void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan);
+
+static inline void
+xrep_findparent_scan_found(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ mutex_lock(&pscan->lock);
+ pscan->parent_ino = ino;
+ mutex_unlock(&pscan->lock);
+}
+
+void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino);
+
+int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino);
+
+xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc);
+xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_FINDPARENT_H__ */
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 5799e9a94f1f..9b598c5790ad 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -19,9 +19,11 @@
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/fscounters.h"
/*
* FS Summary Counters
@@ -48,17 +50,6 @@
* our tolerance for mismatch between expected and actual counter values.
*/
-struct xchk_fscounters {
- struct xfs_scrub *sc;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- uint64_t frextents;
- unsigned long long icount_min;
- unsigned long long icount_max;
- bool frozen;
-};
-
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -84,10 +75,9 @@ xchk_fscount_warmup(
struct xfs_buf *agi_bp = NULL;
struct xfs_buf *agf_bp = NULL;
struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno;
int error = 0;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
if (xchk_should_terminate(sc, &error))
break;
if (xfs_perag_initialised_agi(pag) &&
@@ -95,7 +85,7 @@ xchk_fscount_warmup(
continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
if (error)
break;
error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
@@ -133,7 +123,7 @@ xchk_fsfreeze(
{
int error;
- error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsfreeze(sc, error);
return error;
}
@@ -145,7 +135,7 @@ xchk_fsthaw(
int error;
/* This should always succeed, we have a kernel freeze */
- error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsthaw(sc, error);
return error;
}
@@ -235,14 +225,19 @@ xchk_setup_fscounters(
* Pause all writer activity in the filesystem while we're scrubbing to
* reduce the likelihood of background perturbations to the counters
* throwing off our calculations.
+ *
+ * If we're repairing, we need to prevent any other thread from
+ * changing the global fs summary counters while we're repairing them.
+ * This requires the fs to be frozen, which will disable background
+ * reclaim and purge all inactive inodes.
*/
- if (sc->flags & XCHK_TRY_HARDER) {
+ if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
error = xchk_fscounters_freeze(sc);
if (error)
return error;
}
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/*
@@ -254,7 +249,9 @@ xchk_setup_fscounters(
* set the INCOMPLETE flag even when a negative errno is returned. This care
* must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
* ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error.
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
*/
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
@@ -264,7 +261,7 @@ xchk_fscount_btreeblks(
struct xchk_fscounters *fsc,
xfs_agnumber_t agno)
{
- xfs_extlen_t blocks;
+ xfs_filblks_t blocks;
int error;
error = xchk_ag_init_existing(sc, agno, &sc->sa);
@@ -298,9 +295,8 @@ xchk_fscount_aggregate_agcounts(
struct xchk_fscounters *fsc)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
uint64_t delayed;
- xfs_agnumber_t agno;
int tries = 8;
int error = 0;
@@ -309,7 +305,7 @@ retry:
fsc->ifree = 0;
fsc->fdblocks = 0;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
if (xchk_should_terminate(sc, &error))
break;
@@ -330,7 +326,7 @@ retry:
if (xfs_has_lazysbcount(sc->mp)) {
fsc->fdblocks += pag->pagf_btreeblks;
} else {
- error = xchk_fscount_btreeblks(sc, fsc, agno);
+ error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag));
if (error)
break;
}
@@ -354,7 +350,7 @@ retry:
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -391,7 +387,7 @@ retry:
#ifdef CONFIG_XFS_RT
STATIC int
xchk_fscount_add_frextent(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -412,23 +408,34 @@ xchk_fscount_count_frextents(
struct xchk_fscounters *fsc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = NULL;
int error;
fsc->frextents = 0;
- if (!xfs_has_realtime(mp))
+ fsc->frextents_delayed = 0;
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
- xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_query_all(sc->mp, sc->tp,
- xchk_fscount_add_frextent, fsc);
- if (error) {
- xchk_set_incomplete(sc);
- goto out_unlock;
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_all(rtg, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error) {
+ xchk_set_incomplete(sc);
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
}
-out_unlock:
- xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- return error;
+ fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
+ return 0;
}
#else
STATIC int
@@ -437,6 +444,7 @@ xchk_fscount_count_frextents(
struct xchk_fscounters *fsc)
{
fsc->frextents = 0;
+ fsc->frextents_delayed = 0;
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -482,6 +490,10 @@ xchk_fscount_within_range(
if (curr_value == expected)
return true;
+ /* We require exact matches when repair is running. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return false;
+
min_value = min(old_value, curr_value);
max_value = max(old_value, curr_value);
@@ -507,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
+ fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -516,7 +528,7 @@ xchk_fscounters(
/*
* If the filesystem is not frozen, the counter summation calls above
- * can race with xfs_mod_freecounter, which subtracts a requested space
+ * can race with xfs_dec_freecounter, which subtracts a requested space
* reservation from the counter and undoes the subtraction if that made
* the counter go negative. Therefore, it's possible to see negative
* values here, and we should only flag that as a corruption if we
@@ -583,16 +595,18 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents)) {
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS].count,
+ fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
new file mode 100644
index 000000000000..bcf56e1c36f9
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSCOUNTERS_H__
+#define __XFS_SCRUB_FSCOUNTERS_H__
+
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ uint64_t frextents_delayed;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+ bool frozen;
+};
+
+#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
new file mode 100644
index 000000000000..f0d2b04644e4
--- /dev/null
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/fscounters.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * We correct errors in the filesystem summary counters by setting them to the
+ * values computed during the obligatory scrub phase. However, we must be
+ * careful not to allow any other thread to change the counters while we're
+ * computing and setting new values. To achieve this, we freeze the
+ * filesystem for the whole operation if the REPAIR flag is set. The checking
+ * function is stricter when we've frozen the fs.
+ */
+
+/*
+ * Reset the superblock counters. Caller is responsible for freezing the
+ * filesystem during the calculation and reset phases.
+ */
+int
+xrep_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+
+ /*
+ * Reinitialize the in-core counters from what we computed. We froze
+ * the filesystem, so there shouldn't be anyone else trying to modify
+ * these counters.
+ */
+ if (!fsc->frozen) {
+ ASSERT(fsc->frozen);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_reset_counters(mp, fsc);
+
+ percpu_counter_set(&mp->m_icount, fsc->icount);
+ percpu_counter_set(&mp->m_ifree, fsc->ifree);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
+
+ /*
+ * Online repair is only supported on v5 file systems, which require
+ * lazy sb counters and thus no update of sb_fdblocks here. But
+ * sb_frextents only uses a lazy counter with rtgroups, and thus needs
+ * to be updated directly here otherwise. And for that we need to keep
+ * track of the delalloc reservations separately, as they are are
+ * subtracted from m_frextents, but not included in sb_frextents.
+ */
+ if (!xfs_has_zoned(mp)) {
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ fsc->frextents - fsc->frextents_delayed);
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 531006910ca9..3c0f25098b69 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -12,8 +12,10 @@
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
+#include "scrub/common.h"
/*
* Scrub and In-Core Filesystem Health Assessments
@@ -69,10 +71,11 @@
/* Map our scrub type to a sick mask and a set of health update functions. */
enum xchk_health_group {
- XHG_FS = 1,
- XHG_RT,
+ XHG_NONE = 1,
+ XHG_FS,
XHG_AG,
XHG_INO,
+ XHG_RTGROUP,
};
struct xchk_health_map {
@@ -81,6 +84,7 @@ struct xchk_health_map {
};
static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 },
[XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB },
[XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF },
[XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL },
@@ -99,12 +103,19 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR },
[XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK },
[XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT },
- [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP },
- [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RTGROUP, XFS_SICK_RG_BITMAP },
+ [XFS_SCRUB_TYPE_RTSUM] = { XHG_RTGROUP, XFS_SICK_RG_SUMMARY },
[XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA },
[XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
[XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
+ [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
+ [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE },
+ [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH },
+ [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER },
+ [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT },
+ [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT },
};
/* Return the health status mask for this scrub type. */
@@ -126,7 +137,7 @@ xchk_mark_healthy_if_clean(
{
if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
XFS_SCRUB_OFLAG_XCORRUPT)))
- sc->sick_mask |= mask;
+ sc->healthy_mask |= mask;
}
/*
@@ -148,6 +159,25 @@ xchk_file_looks_zapped(
}
/*
+ * Scrub gave the filesystem a clean bill of health, so clear all the indirect
+ * markers of past problems (at least for the fs and ags) so that we can be
+ * healthy again.
+ */
+STATIC void
+xchk_mark_all_healthy(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
+
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
+ while ((pag = xfs_perag_next(mp, pag)))
+ xfs_group_mark_healthy(pag_group(pag), XFS_SICK_AG_INDIRECT);
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_group_mark_healthy(rtg_group(rtg), XFS_SICK_RG_INDIRECT);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
@@ -162,41 +192,73 @@ xchk_update_health(
struct xfs_scrub *sc)
{
struct xfs_perag *pag;
+ struct xfs_rtgroup *rtg;
+ unsigned int mask = sc->sick_mask;
bool bad;
- if (!sc->sick_mask)
+ /*
+ * The HEALTHY scrub type is a request from userspace to clear all the
+ * indirect flags after a clean scan of the entire filesystem. As such
+ * there's no sick flag defined for it, so we branch here ahead of the
+ * mask check.
+ */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xchk_mark_all_healthy(sc->mp);
return;
+ }
bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
XFS_SCRUB_OFLAG_XCORRUPT));
+ if (!bad)
+ mask |= sc->healthy_mask;
switch (type_to_health_flag[sc->sm->sm_type].group) {
+ case XHG_NONE:
+ break;
case XHG_AG:
+ if (!mask)
+ return;
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_ag_mark_sick(pag, sc->sick_mask);
+ xfs_group_mark_corrupt(pag_group(pag), mask);
else
- xfs_ag_mark_healthy(pag, sc->sick_mask);
+ xfs_group_mark_healthy(pag_group(pag), mask);
xfs_perag_put(pag);
break;
case XHG_INO:
if (!sc->ip)
return;
+ /*
+ * If we're coming in for repairs then we don't want sickness
+ * flags to propagate to the incore health status if the inode
+ * gets inactivated before we can fix it.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mask |= XFS_SICK_INO_FORGET;
+ if (!mask)
+ return;
if (bad)
- xfs_inode_mark_sick(sc->ip, sc->sick_mask);
+ xfs_inode_mark_corrupt(sc->ip, mask);
else
- xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+ xfs_inode_mark_healthy(sc->ip, mask);
break;
case XHG_FS:
+ if (!mask)
+ return;
if (bad)
- xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ xfs_fs_mark_corrupt(sc->mp, mask);
else
- xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+ xfs_fs_mark_healthy(sc->mp, mask);
break;
- case XHG_RT:
+ case XHG_RTGROUP:
+ if (!mask)
+ return;
+ rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ xfs_group_mark_corrupt(rtg_group(rtg), mask);
else
- xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+ xfs_group_mark_healthy(rtg_group(rtg), mask);
+ xfs_rtgroup_put(rtg);
break;
default:
ASSERT(0);
@@ -205,13 +267,13 @@ xchk_update_health(
}
/* Is the given per-AG btree healthy enough for scanning? */
-bool
-xchk_ag_btree_healthy_enough(
+void
+xchk_ag_btree_del_cursor_if_sick(
struct xfs_scrub *sc,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_btree_cur **curp,
+ unsigned int sm_type)
{
- unsigned int mask = 0;
+ unsigned int mask = (*curp)->bc_ops->sick_mask;
/*
* We always want the cursor if it's the same type as whatever we're
@@ -220,41 +282,8 @@ xchk_ag_btree_healthy_enough(
* Otherwise, we're only interested in the btree for cross-referencing.
* If we know the btree is bad then don't bother, just set XFAIL.
*/
- switch (btnum) {
- case XFS_BTNUM_BNO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
- return true;
- mask = XFS_SICK_AG_BNOBT;
- break;
- case XFS_BTNUM_CNT:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
- return true;
- mask = XFS_SICK_AG_CNTBT;
- break;
- case XFS_BTNUM_INO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
- return true;
- mask = XFS_SICK_AG_INOBT;
- break;
- case XFS_BTNUM_FINO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
- return true;
- mask = XFS_SICK_AG_FINOBT;
- break;
- case XFS_BTNUM_RMAP:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
- return true;
- mask = XFS_SICK_AG_RMAPBT;
- break;
- case XFS_BTNUM_REFC:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
- return true;
- mask = XFS_SICK_AG_REFCNTBT;
- break;
- default:
- ASSERT(0);
- return true;
- }
+ if (sc->sm->sm_type == sm_type)
+ return;
/*
* If we just repaired some AG metadata, sc->sick_mask will reflect all
@@ -266,10 +295,43 @@ xchk_ag_btree_healthy_enough(
type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
mask &= ~sc->sick_mask;
- if (xfs_ag_has_sickness(pag, mask)) {
+ if (xfs_group_has_sickness((*curp)->bc_group, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
- return false;
+ xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
+ *curp = NULL;
+ }
+}
+
+/*
+ * Quick scan to double-check that there isn't any evidence of lingering
+ * primary health problems. If we're still clear, then the health update will
+ * take care of clearing the indirect evidence.
+ */
+int
+xchk_health_record(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_FS_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ while ((pag = xfs_perag_next(mp, pag))) {
+ xfs_group_measure_sickness(pag_group(pag), &sick, &checked);
+ if (sick & XFS_SICK_AG_PRIMARY)
+ xchk_set_corrupt(sc);
+ }
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked);
+ if (sick & XFS_SICK_RG_PRIMARY)
+ xchk_set_corrupt(sc);
}
- return true;
+ return 0;
}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index a731b2467399..63fc426eb5ae 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -8,9 +8,10 @@
unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
-bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc,
+ struct xfs_btree_cur **curp, unsigned int sm_type);
void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
+int xchk_health_record(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index a720fc62262a..4dc7c83dc08a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -76,7 +76,7 @@ xchk_inobt_xref_finobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+ ASSERT(xfs_btree_is_fino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -179,7 +179,7 @@ xchk_finobt_xref_inobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -258,7 +258,7 @@ xchk_iallocbt_chunk(
{
struct xfs_scrub *sc = bs->sc;
struct xfs_mount *mp = bs->cur->bc_mp;
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(bs->cur->bc_group);
xfs_agblock_t agbno;
xfs_extlen_t len;
@@ -303,7 +303,6 @@ xchk_iallocbt_check_cluster_ifree(
unsigned int irec_ino,
struct xfs_dinode *dip)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
xfs_ino_t fsino;
xfs_agino_t agino;
bool irec_free;
@@ -319,7 +318,7 @@ xchk_iallocbt_check_cluster_ifree(
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino);
+ fsino = xfs_agino_to_ino(to_perag(bs->cur->bc_group), agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -368,7 +367,6 @@ xchk_iallocbt_check_cluster(
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -396,7 +394,7 @@ xchk_iallocbt_check_cluster(
* ir_startino can be large enough to make im_boffset nonzero.
*/
ir_holemask = (irec->ir_holemask & cluster_mask);
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_blkno = xfs_agbno_to_daddr(to_perag(bs->cur->bc_group), agbno);
imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
mp->m_sb.sb_inodelog;
@@ -407,9 +405,9 @@ xchk_iallocbt_check_cluster(
return 0;
}
- trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
- imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
- cluster_mask, ir_holemask,
+ trace_xchk_iallocbt_check_cluster(to_perag(bs->cur->bc_group),
+ irec->ir_startino, imap.im_blkno, imap.im_len,
+ cluster_base, nr_inodes, cluster_mask, ir_holemask,
XFS_INO_TO_OFFSET(mp, irec->ir_startino +
cluster_base));
@@ -514,7 +512,7 @@ xchk_iallocbt_rec_alignment(
* Otherwise, we expect that the finobt record is aligned to the
* cluster alignment as told by the superblock.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ if (xfs_btree_is_fino(bs->cur->bc_ops)) {
unsigned int imask;
imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
@@ -585,7 +583,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_inobt_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -649,12 +647,11 @@ out:
*/
STATIC void
xchk_iallocbt_xref_rmap_btreeblks(
- struct xfs_scrub *sc,
- int which)
+ struct xfs_scrub *sc)
{
xfs_filblks_t blocks;
- xfs_extlen_t inobt_blocks = 0;
- xfs_extlen_t finobt_blocks = 0;
+ xfs_filblks_t inobt_blocks = 0;
+ xfs_filblks_t finobt_blocks = 0;
int error;
if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
@@ -688,7 +685,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
STATIC void
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
- int which,
unsigned long long inodes)
{
xfs_filblks_t blocks;
@@ -719,17 +715,14 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
- xfs_btnum_t which;
int error;
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_INOBT:
cur = sc->sa.ino_cur;
- which = XFS_BTNUM_INO;
break;
case XFS_SCRUB_TYPE_FINOBT:
cur = sc->sa.fino_cur;
- which = XFS_BTNUM_FINO;
break;
default:
ASSERT(0);
@@ -741,7 +734,7 @@ xchk_iallocbt(
if (error)
return error;
- xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+ xchk_iallocbt_xref_rmap_btreeblks(sc);
/*
* If we're scrubbing the inode btree, inode_blocks is the number of
@@ -750,9 +743,8 @@ xchk_iallocbt(
* knows about. We can't do this for the finobt since it only points
* to inode chunks with free inodes.
*/
- if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
-
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes);
return error;
}
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
index b3f7182dd2f5..14e48d3f1912 100644
--- a/fs/xfs/scrub/ialloc_repair.c
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -146,15 +146,12 @@ xrep_ibt_check_ifree(
struct xfs_scrub *sc = ri->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_dinode *dip;
- xfs_ino_t fsino;
xfs_agino_t agino;
- xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
unsigned int cluster_buf_base;
unsigned int offset;
int error;
agino = cluster_ag_base + cluster_index;
- fsino = XFS_AGINO_TO_INO(mp, agno, agino);
/* Inode uncached or half assembled, read disk buffer */
cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
@@ -165,7 +162,8 @@ xrep_ibt_check_ifree(
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
return -EFSCORRUPTED;
- if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+ if (dip->di_version >= 3 &&
+ be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino))
return -EFSCORRUPTED;
/* Will the in-core inode tell us if it's in use? */
@@ -194,7 +192,7 @@ xrep_ibt_stash(
if (ri->rie.ir_freecount > 0)
ri->finobt_recs++;
- trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
+ trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);
error = xfarray_append(ri->inode_records, &ri->rie);
if (error)
@@ -307,7 +305,7 @@ xrep_ibt_process_cluster(
* inobt because imap_to_bp directly maps the buffer without touching
* either inode btree.
*/
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
+ imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno);
imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
imap.im_boffset = 0;
error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
@@ -369,7 +367,7 @@ xrep_ibt_check_inode_ext(
* On a sparse inode fs, this cluster could be part of a sparse chunk.
* Sparse clusters must be aligned to sparse chunk alignment.
*/
- if (xfs_has_sparseinodes(mp) &&
+ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
(!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
!IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
return -EFSCORRUPTED;
@@ -423,9 +421,7 @@ xrep_ibt_record_inode_blocks(
if (error)
return error;
- trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
- rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
- rec->rm_offset, rec->rm_flags);
+ trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);
/*
* Record the free/hole masks for each inode cluster that could be
@@ -634,7 +630,6 @@ xrep_ibt_build_new_trees(
struct xfs_scrub *sc = ri->sc;
struct xfs_btree_cur *ino_cur;
struct xfs_btree_cur *fino_cur = NULL;
- xfs_fsblock_t fsbno;
bool need_finobt;
int error;
@@ -656,15 +651,14 @@ xrep_ibt_build_new_trees(
*
* Start by setting up the inobt staging cursor.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_IBT_BLOCK(sc->mp)),
- xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
+ xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
XFS_AG_RESV_NONE);
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
- ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
- XFS_BTNUM_INO);
+ ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
xfarray_length(ri->inode_records));
if (error)
@@ -677,15 +671,14 @@ xrep_ibt_build_new_trees(
if (sc->mp->m_finobt_nores)
resv = XFS_AG_RESV_NONE;
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_FIBT_BLOCK(sc->mp)),
xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
- fsbno, resv);
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)),
+ resv);
ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
ri->new_finobt.bload.get_records = xrep_fibt_get_records;
- fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
- &ri->new_finobt.afake, XFS_BTNUM_FINO);
+ fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
error = xfs_btree_bload_compute_geometry(fino_cur,
&ri->new_finobt.bload, ri->finobt_recs);
if (error)
@@ -821,7 +814,7 @@ xrep_iallocbt(
sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
/* Set up enough storage to handle an AG with nothing but inodes. */
- xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
+ xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
last_agino /= XFS_INODES_PER_CHUNK;
descr = xchk_xfile_ag_descr(sc, "inode index records");
error = xfarray_create(descr, last_agino,
diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h
new file mode 100644
index 000000000000..1300833679ab
--- /dev/null
+++ b/fs/xfs/scrub/ino_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_INO_BITMAP_H__
+#define __XFS_SCRUB_INO_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_ino_t */
+
+struct xino_bitmap {
+ struct xbitmap64 inobitmap;
+};
+
+static inline void xino_bitmap_init(struct xino_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->inobitmap);
+}
+
+static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->inobitmap);
+}
+
+static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ return xbitmap64_set(&bitmap->inobitmap, ino, 1);
+}
+
+static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino)
+{
+ uint64_t len = 1;
+
+ return xbitmap64_test(&bitmap->inobitmap, ino, &len);
+}
+
+#endif /* __XFS_SCRUB_INO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6e2fe2d6250b..bb3f475b6353 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -60,6 +60,22 @@ xchk_install_handle_iscrub(
if (error)
return error;
+ /*
+ * Don't allow scrubbing by handle of any non-directory inode records
+ * in the metadata directory tree. We don't know if any of the scans
+ * launched by this scrubber will end up indirectly trying to lock this
+ * file.
+ *
+ * Scrubbers of inode-rooted metadata files (e.g. quota files) will
+ * attach all the resources needed to scrub the inode and call
+ * xchk_inode directly. Userspace cannot call this directly.
+ */
+ if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) {
+ xchk_irele(sc, ip);
+ sc->ip = NULL;
+ return -ENOENT;
+ }
+
return xchk_prepare_iscrub(sc);
}
@@ -94,9 +110,15 @@ xchk_setup_inode(
return xchk_prepare_iscrub(sc);
}
- /* Reject internal metadata files and obviously bad inode numbers. */
- if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ /*
+ * On pre-metadir filesystems, reject internal metadata files. For
+ * metadir filesystems, limited scrubbing of any file in the metadata
+ * directory tree by handle is allowed, because that is the only way to
+ * validate the lack of parent pointers in the sb-root metadata inodes.
+ */
+ if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
return -ENOENT;
+ /* Reject obviously bad inode numbers. */
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
@@ -238,12 +260,7 @@ xchk_inode_extsize(
xchk_ino_set_warning(sc, ino);
}
-/*
- * Validate di_cowextsize hint.
- *
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
- */
+/* Validate di_cowextsize hint. */
STATIC void
xchk_inode_cowextsize(
struct xfs_scrub *sc,
@@ -254,12 +271,32 @@ xchk_inode_cowextsize(
uint64_t flags2)
{
xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_cowextsize);
+
+ /*
+ * The used block counter for rtrmap is checked and repaired elsewhere.
+ */
+ if (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
+ return;
- fa = xfs_inode_validate_cowextsize(sc->mp,
- be32_to_cpu(dip->di_cowextsize), mode, flags,
- flags2);
+ fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with cowextsize and rtinherit set, the hint could become
+ * misaligned with the new rextsize. The verifier doesn't check this,
+ * because we allow rtinherit directories even without an rt device.
+ * Flag this as an administrative warning since we will clean this up
+ * eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
}
/* Make sure the di_flags make sense for the inode. */
@@ -338,8 +375,9 @@ xchk_inode_flags2(
if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
goto bad;
- /* realtime and reflink make no sense, currently */
- if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
+ /* realtime and reflink don't always go together */
+ if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) &&
+ !xfs_has_rtreflink(mp))
goto bad;
/* no bigtime iflag without the bigtime feature */
@@ -421,8 +459,13 @@ xchk_dinode(
break;
case 2:
case 3:
- if (dip->di_onlink != 0)
- xchk_ino_set_corrupt(sc, ino);
+ if (xfs_dinode_is_metadir(dip)) {
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ xchk_ino_set_corrupt(sc, ino);
+ } else {
+ if (dip->di_metatype != 0)
+ xchk_ino_set_corrupt(sc, ino);
+ }
if (dip->di_mode == 0 && sc->ip)
xchk_ino_set_corrupt(sc, ino);
@@ -475,6 +518,10 @@ xchk_dinode(
if (!S_ISREG(mode) && !S_ISDIR(mode))
xchk_ino_set_corrupt(sc, ino);
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (!S_ISREG(mode))
+ xchk_ino_set_corrupt(sc, ino);
+ break;
case XFS_DINODE_FMT_UUID:
default:
xchk_ino_set_corrupt(sc, ino);
@@ -659,15 +706,13 @@ xchk_inode_xref_bmap(
return;
/* Walk all the extents to check nextents/naextents/nblocks. */
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
- &nextents, &count);
+ error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
- &nextents, &acount);
+ error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (nextents != xfs_dfork_attr_extents(dip))
@@ -739,6 +784,23 @@ xchk_inode_check_reflink_iflag(
xchk_ino_set_corrupt(sc, ino);
}
+/*
+ * If this inode has zero link count, it must be on the unlinked list. If
+ * it has nonzero link count, it must not be on the unlinked list.
+ */
+STATIC void
+xchk_inode_check_unlinked(
+ struct xfs_scrub *sc)
+{
+ if (VFS_I(sc->ip)->i_nlink == 0) {
+ if (!xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ if (xfs_inode_on_unlinked_list(sc->ip))
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+}
+
/* Scrub an inode. */
int
xchk_inode(
@@ -771,6 +833,8 @@ xchk_inode(
if (S_ISREG(VFS_I(sc->ip)->i_mode))
xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino);
+ xchk_inode_check_unlinked(sc);
+
xchk_inode_xref(sc, sc->ip->i_ino, &di);
out:
return error;
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 0ca62d59f84a..a90a011c7e5f 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -37,12 +37,19 @@
#include "xfs_attr_leaf.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
/*
* Inode Record Repair
@@ -126,6 +133,10 @@ struct xrep_inode {
/* Must we remove all access from this file? */
bool zap_acls;
+
+ /* Inode scanner to see if we can find the ftype from dirents */
+ struct xchk_iscan ftype_iscan;
+ uint8_t alleged_ftype;
};
/*
@@ -227,26 +238,303 @@ xrep_dinode_header(
dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
}
-/* Turn di_mode into /something/ recognizable. */
-STATIC void
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (ino != sc->sm->sm_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Uhoh, more than one parent for this inode and they don't agree on
+ * the file type?
+ */
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+ ri->alleged_ftype != name->type) {
+ trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+ ri->alleged_ftype);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember the ftype. */
+ trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+ ri->alleged_ftype = name->type;
+ return 0;
+}
+
+/* Try to lock a directory, or wait a jiffy. */
+static inline int
+xrep_dinode_ilock_nowait(
+ struct xfs_inode *dp,
+ unsigned int lock_mode)
+{
+ if (xfs_ilock_nowait(dp, lock_mode))
+ return true;
+
+ schedule_timeout_killable(1);
+ return false;
+}
+
+/*
+ * Try to lock a directory to look for ftype hints. Since we already hold the
+ * AGI buffer, we cannot block waiting for the ILOCK because rename can take
+ * the ILOCK and then try to lock AGIs.
+ */
+STATIC int
+xrep_dinode_trylock_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp,
+ unsigned int *lock_modep)
+{
+ unsigned long deadline = jiffies + msecs_to_jiffies(30000);
+ unsigned int lock_mode;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (xfs_need_iread_extents(&dp->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+ else
+ lock_mode = XFS_ILOCK_SHARED;
+
+ if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
+ *lock_modep = lock_mode;
+ return 0;
+ }
+ } while (!time_is_before_jiffies(deadline));
+ return -EBUSY;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = ri->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Ignore temporary repair directories. */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
+ /*
+ * Scan the directory to see if there it contains an entry pointing to
+ * the directory that we are repairing.
+ */
+ error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
+ if (error)
+ return error;
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+ struct xrep_inode *ri,
+ uint16_t *mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /* No ftype means we have no other metadata to consult. */
+ if (!xfs_has_ftype(sc->mp)) {
+ *mode = S_IFREG;
+ return 0;
+ }
+
+ /*
+ * Scan all directories for parents that might point down to this
+ * inode. Skip the inode being repaired during the scan since it
+ * cannot be its own parent. Note that we still hold the AGI locked
+ * so there's a real possibility that _iscan_iter can return EBUSY.
+ */
+ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
+ ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+ ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+ while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+ if (S_ISDIR(VFS_I(dp)->i_mode))
+ error = xrep_dinode_findmode_walk_directory(ri, dp);
+ xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+ xchk_irele(sc, dp);
+ if (error < 0)
+ break;
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&ri->ftype_iscan);
+ xchk_iscan_teardown(&ri->ftype_iscan);
+
+ if (error == -EBUSY) {
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+ /*
+ * If we got an EBUSY after finding at least one
+ * dirent, that means the scan found an inode on the
+ * inactivation list and could not open it. Accept the
+ * alleged ftype and install a new mode below.
+ */
+ error = 0;
+ } else if (!(sc->flags & XCHK_TRY_HARDER)) {
+ /*
+ * Otherwise, retry the operation one time to see if
+ * the reason for the delay is an inode from the same
+ * cluster buffer waiting on the inactivation list.
+ */
+ error = -EDEADLOCK;
+ }
+ }
+ if (error)
+ return error;
+
+ /*
+ * Convert the discovered ftype into the file mode. If all else fails,
+ * return S_IFREG.
+ */
+ switch (ri->alleged_ftype) {
+ case XFS_DIR3_FT_DIR:
+ *mode = S_IFDIR;
+ break;
+ case XFS_DIR3_FT_WHT:
+ case XFS_DIR3_FT_CHRDEV:
+ *mode = S_IFCHR;
+ break;
+ case XFS_DIR3_FT_BLKDEV:
+ *mode = S_IFBLK;
+ break;
+ case XFS_DIR3_FT_FIFO:
+ *mode = S_IFIFO;
+ break;
+ case XFS_DIR3_FT_SOCK:
+ *mode = S_IFSOCK;
+ break;
+ case XFS_DIR3_FT_SYMLINK:
+ *mode = S_IFLNK;
+ break;
+ default:
+ *mode = S_IFREG;
+ break;
+ }
+ return 0;
+}
+
+/* Turn di_mode into /something/ recognizable. Returns true if we succeed. */
+STATIC int
xrep_dinode_mode(
struct xrep_inode *ri,
struct xfs_dinode *dip)
{
struct xfs_scrub *sc = ri->sc;
uint16_t mode = be16_to_cpu(dip->di_mode);
+ int error;
trace_xrep_dinode_mode(sc, dip);
if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
- return;
+ return 0;
+
+ /* Try to fix the mode. If we cannot, then leave everything alone. */
+ error = xrep_dinode_find_mode(ri, &mode);
+ switch (error) {
+ case -EINTR:
+ case -EBUSY:
+ case -EDEADLOCK:
+ /* temporary failure or fatal signal */
+ return error;
+ case 0:
+ /* found mode */
+ break;
+ default:
+ /* some other error, assume S_IFREG */
+ mode = S_IFREG;
+ break;
+ }
/* bad mode, so we set it to a file that only root can read */
- mode = S_IFREG;
dip->di_mode = cpu_to_be16(mode);
dip->di_uid = 0;
dip->di_gid = 0;
ri->zap_acls = true;
+ return 0;
+}
+
+/* Fix unused link count fields having nonzero values. */
+STATIC void
+xrep_dinode_nlinks(
+ struct xfs_dinode *dip)
+{
+ if (dip->di_version < 2) {
+ dip->di_nlink = 0;
+ return;
+ }
+
+ if (xfs_dinode_is_metadir(dip)) {
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN);
+ } else {
+ dip->di_metatype = 0;
+ }
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -277,8 +565,6 @@ xrep_dinode_flags(
flags2 |= XFS_DIFLAG2_REFLINK;
else
flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
- if (flags & XFS_DIFLAG_REALTIME)
- flags2 &= ~XFS_DIFLAG2_REFLINK;
if (!xfs_has_bigtime(mp))
flags2 &= ~XFS_DIFLAG2_BIGTIME;
if (!xfs_has_large_extent_counts(mp))
@@ -287,6 +573,16 @@ xrep_dinode_flags(
dip->di_nrext64_pad = 0;
else if (dip->di_version >= 3)
dip->di_v3_pad = 0;
+
+ if (flags2 & XFS_DIFLAG2_METADATA) {
+ xfs_failaddr_t fa;
+
+ fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags,
+ flags2);
+ if (fa)
+ flags2 &= ~XFS_DIFLAG2_METADATA;
+ }
+
dip->di_flags = cpu_to_be16(flags);
dip->di_flags2 = cpu_to_be64(flags2);
}
@@ -414,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
- if (dip->di_version < 3)
+ if (dip->di_version < 3 ||
+ (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
@@ -478,19 +776,72 @@ xrep_dinode_count_ag_rmaps(
return error;
}
+/* Count extents and blocks for an inode given an rt rmap. */
+STATIC int
+xrep_dinode_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ /* We only care about this inode. */
+ if (rec->rm_owner != ri->sc->sm->sm_ino)
+ return 0;
+
+ if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))
+ return -EFSCORRUPTED;
+
+ ri->rt_blocks += rec->rm_blockcount;
+ ri->rt_extents++;
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all realtime rmap data. */
+STATIC int
+xrep_dinode_count_rtgroup_rmaps(
+ struct xrep_inode *ri,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
+ ri);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ xchk_rtgroup_free(sc, &sc->sr);
+ return error;
+}
+
/* Count extents and blocks for a given inode from all rmap data. */
STATIC int
xrep_dinode_count_rmaps(
struct xrep_inode *ri)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
int error;
- if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+ if (!xfs_has_rmapbt(ri->sc->mp))
return -EOPNOTSUPP;
- for_each_perag(ri->sc->mp, agno, pag) {
+ while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) {
+ error = xrep_dinode_count_rtgroup_rmaps(ri, rtg);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
+
+ while ((pag = xfs_perag_next(ri->sc->mp, pag))) {
error = xrep_dinode_count_ag_rmaps(ri, pag);
if (error) {
xfs_perag_rele(pag);
@@ -568,7 +919,7 @@ xrep_dinode_bad_bmbt_fork(
nrecs = be16_to_cpu(dfp->bb_numrecs);
level = be16_to_cpu(dfp->bb_level);
- if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+ if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size)
return true;
if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
return true;
@@ -580,12 +931,12 @@ xrep_dinode_bad_bmbt_fork(
xfs_fileoff_t fileoff;
xfs_fsblock_t fsbno;
- fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+ fkp = xfs_bmdr_key_addr(dfp, i);
fileoff = be64_to_cpu(fkp->br_startoff);
if (!xfs_verify_fileoff(sc->mp, fileoff))
return true;
- fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+ fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr);
fsbno = be64_to_cpu(*fpp);
if (!xfs_verify_fsbno(sc->mp, fsbno))
return true;
@@ -594,6 +945,85 @@ xrep_dinode_bad_bmbt_fork(
return false;
}
+/* Return true if this rmap-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rtrmapbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size)
+{
+ struct xfs_rtrmap_root *dfp;
+ unsigned int nrecs;
+ unsigned int level;
+
+ if (dfork_size < sizeof(struct xfs_rtrmap_root))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > sc->mp->m_rtrmap_maxlevels)
+ return true;
+ if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size)
+ return true;
+ if (level > 0 && nrecs == 0)
+ return true;
+
+ return false;
+}
+
+/* Return true if this refcount-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_rtrefcountbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size)
+{
+ struct xfs_rtrefcount_root *dfp;
+ unsigned int nrecs;
+ unsigned int level;
+
+ if (dfork_size < sizeof(struct xfs_rtrefcount_root))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (level > sc->mp->m_rtrefc_maxlevels)
+ return true;
+ if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size)
+ return true;
+ if (level > 0 && nrecs == 0)
+ return true;
+
+ return false;
+}
+
+/* Check a metadata-btree fork. */
+STATIC bool
+xrep_dinode_bad_metabt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ if (whichfork != XFS_DATA_FORK)
+ return true;
+
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size);
+ case XFS_METAFILE_RTREFCOUNT:
+ return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size);
+ default:
+ return true;
+ }
+
+ return false;
+}
+
/*
* Check the data fork for things that will fail the ifork verifiers or the
* ifork formatters.
@@ -627,9 +1057,17 @@ xrep_dinode_check_dfork(
return true;
break;
case S_IFREG:
- if (fmt == XFS_DINODE_FMT_LOCAL)
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
return true;
- fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
case S_IFLNK:
case S_IFDIR:
switch (fmt) {
@@ -674,6 +1112,11 @@ xrep_dinode_check_dfork(
XFS_DATA_FORK))
return true;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
default:
return true;
}
@@ -794,6 +1237,11 @@ xrep_dinode_check_afork(
XFS_ATTR_FORK))
return true;
break;
+ case XFS_DINODE_FMT_META_BTREE:
+ if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
default:
return true;
}
@@ -841,9 +1289,11 @@ xrep_dinode_ensure_forkoff(
uint16_t mode)
{
struct xfs_bmdr_block *bmdr;
+ struct xfs_rtrmap_root *rmdr;
+ struct xfs_rtrefcount_root *rcdr;
struct xfs_scrub *sc = ri->sc;
xfs_extnum_t attr_extents, data_extents;
- size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+ size_t bmdr_minsz = xfs_bmdr_space_calc(1);
unsigned int lit_sz = XFS_LITINO(sc->mp);
unsigned int afork_min, dfork_min;
@@ -895,7 +1345,7 @@ xrep_dinode_ensure_forkoff(
case XFS_DINODE_FMT_BTREE:
/* Must have space for btree header and key/pointers. */
bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
- afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ afork_min = xfs_bmap_broot_space(sc->mp, bmdr);
break;
default:
/* We should never see any other formats. */
@@ -945,7 +1395,22 @@ xrep_dinode_ensure_forkoff(
case XFS_DINODE_FMT_BTREE:
/* Must have space for btree header and key/pointers. */
bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
- dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
+ break;
+ case XFS_DINODE_FMT_META_BTREE:
+ switch (be16_to_cpu(dip->di_metatype)) {
+ case XFS_METAFILE_RTRMAP:
+ rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr);
+ break;
+ default:
+ dfork_min = 0;
+ break;
+ }
break;
default:
dfork_min = 0;
@@ -1095,8 +1560,7 @@ xrep_dinode_core(
/* Read the inode cluster buffer. */
error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
- ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
- NULL);
+ ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
if (error)
return error;
@@ -1107,12 +1571,16 @@ xrep_dinode_core(
/* Fix everything the verifier will complain about. */
dip = xfs_buf_offset(bp, ri->imap.im_boffset);
xrep_dinode_header(sc, dip);
- xrep_dinode_mode(ri, dip);
+ iget_error = xrep_dinode_mode(ri, dip);
+ if (iget_error)
+ goto write;
+ xrep_dinode_nlinks(dip);
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
xrep_dinode_zap_forks(ri, dip);
+write:
/* Write out the inode. */
trace_xrep_dinode_fixed(sc, dip);
xfs_dinode_calc_crc(sc->mp, dip);
@@ -1128,7 +1596,8 @@ xrep_dinode_core(
* accessing the inode. If iget fails, we still need to commit the
* changes.
*/
- iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ iget_error = xchk_iget(sc, ino, &sc->ip);
if (!iget_error)
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1201,8 +1670,7 @@ xrep_inode_blockcounts(
trace_xrep_inode_blockcounts(sc);
/* Set data fork counters from the data fork mappings. */
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
- &nextents, &count);
+ error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count);
if (error)
return error;
if (xfs_is_reflink_inode(sc->ip)) {
@@ -1226,8 +1694,8 @@ xrep_inode_blockcounts(
/* Set attr fork counters from the attr fork mappings. */
ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
if (ifp) {
- error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
- &nextents, &acount);
+ error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents,
+ &acount);
if (error)
return error;
if (count >= sc->mp->m_sb.sb_dblocks)
@@ -1365,10 +1833,6 @@ xrep_inode_flags(
/* DAX only applies to files and dirs. */
if (!(S_ISREG(mode) || S_ISDIR(mode)))
sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
-
- /* No reflink files on the realtime device. */
- if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
- sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
}
/*
@@ -1453,6 +1917,51 @@ xrep_inode_extsize(
}
}
+/* Ensure this file has an attr fork if it needs to hold a parent pointer. */
+STATIC int
+xrep_inode_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct inode *inode = VFS_I(ip);
+
+ if (!xfs_has_parent(mp))
+ return 0;
+
+ /*
+ * Unlinked inodes that cannot be added to the directory tree will not
+ * have a parent pointer.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return 0;
+
+ /* Children of the superblock do not have parent pointers. */
+ if (xchk_inode_is_sb_rooted(ip))
+ return 0;
+
+ /* Inode already has an attr fork; no further work possible here. */
+ if (xfs_inode_has_attr_fork(ip))
+ return 0;
+
+ return xfs_bmap_add_attrfork(sc->tp, ip,
+ sizeof(struct xfs_attr_sf_hdr), true);
+}
+
+/* Fix COW extent size hint problems. */
+STATIC void
+xrep_inode_cowextsize(
+ struct xfs_scrub *sc)
+{
+ /* Fix misaligned CoW extent size hints on a directory. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) {
+ sc->ip->i_cowextsize = 0;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ }
+}
+
/* Fix any irregularities in an inode that the verifiers don't catch. */
STATIC int
xrep_inode_problems(
@@ -1463,6 +1972,9 @@ xrep_inode_problems(
error = xrep_inode_blockcounts(sc);
if (error)
return error;
+ error = xrep_inode_pptr(sc);
+ if (error)
+ return error;
xrep_inode_timestamps(sc->ip);
xrep_inode_flags(sc);
xrep_inode_ids(sc);
@@ -1473,12 +1985,53 @@ xrep_inode_problems(
if (S_ISDIR(VFS_I(sc->ip)->i_mode))
xrep_inode_dir_size(sc);
xrep_inode_extsize(sc);
+ xrep_inode_cowextsize(sc);
trace_xrep_inode_fixed(sc);
xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
return xrep_roll_trans(sc);
}
+/*
+ * Make sure this inode's unlinked list pointers are consistent with its
+ * link count.
+ */
+STATIC int
+xrep_inode_unlinked(
+ struct xfs_scrub *sc)
+{
+ unsigned int nlink = VFS_I(sc->ip)->i_nlink;
+ int error;
+
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) {
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(sc->mp,
+ XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If this inode is not linked from the directory tree yet not on the
+ * unlinked list, put it on the unlinked list.
+ */
+ if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) {
+ error = xfs_iunlink(sc->tp, sc->ip);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Repair an inode's fields. */
int
xrep_inode(
@@ -1496,6 +2049,13 @@ xrep_inode(
ASSERT(ri != NULL);
error = xrep_dinode_problems(ri);
+ if (error == -EBUSY) {
+ /*
+ * Directory scan to recover inode mode encountered a
+ * busy inode, so we did not continue repairing things.
+ */
+ return 0;
+ }
if (error)
return error;
@@ -1521,5 +2081,10 @@ xrep_inode(
return error;
}
+ /* Reconnect incore unlinked list */
+ error = xrep_inode_unlinked(sc);
+ if (error)
+ return error;
+
return xrep_defer_finish(sc);
}
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..84f117667ca2
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,826 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem. This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value. If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind. The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that already been scanned. The iscan lock
+ * can be used in live update hook code to stop the scan and protect this data
+ * structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs. The inode scanner must tell
+ * the hook code when an inode has been visited with xchk_iscan_mark_visit.
+ * Hook functions can use xchk_iscan_want_live_update to decide if the
+ * scanner's observations must be updated.
+ */
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_agino_t lastrecino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+ xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+ if (pag_agno(pag) != skip_agno)
+ return;
+ if (skip_agino < rec->ir_startino)
+ return;
+ if (skip_agino > lastrecino)
+ return;
+
+ rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+ struct xchk_iscan *iscan,
+ struct xfs_buf *agi_bp,
+ struct xfs_perag *pag,
+ xfs_inofree_t *allocmaskp,
+ xfs_agino_t *cursor,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ xfs_agnumber_t agno = pag_agno(pag);
+ xfs_agino_t lastino = NULLAGINO;
+ xfs_agino_t first, last;
+ xfs_agino_t agino = *cursor;
+ int has_rec;
+ int error;
+
+ /* If the cursor is beyond the end of this AG, move to the next one. */
+ xfs_agino_range(mp, agno, &first, &last);
+ if (agino > last) {
+ *cursor = NULLAGINO;
+ return 0;
+ }
+
+ /*
+ * Look up the inode chunk for the current cursor position. If there
+ * is no chunk here, we want the next one.
+ */
+ cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+ if (!error && !has_rec)
+ error = xfs_btree_increment(cur, 0, &has_rec);
+ for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+ xfs_inofree_t allocmask;
+
+ /*
+ * If we've run out of inobt records in this AG, move the
+ * cursor on to the next AG and exit. The caller can try
+ * again with the next AG.
+ */
+ if (!has_rec) {
+ *cursor = NULLAGINO;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+ if (error)
+ break;
+ if (!has_rec) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Make sure that we always move forward. */
+ if (lastino != NULLAGINO &&
+ XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ /*
+ * If this record only covers inodes that come before the
+ * cursor, advance to the next record.
+ */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ continue;
+
+ if (iscan->skip_ino)
+ xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+ /*
+ * If the incoming lookup put us in the middle of an inobt
+ * record, mark it and the previous inodes "free" so that the
+ * search for allocated inodes will start at the cursor.
+ * We don't care about ir_freecount here.
+ */
+ if (agino >= rec.ir_startino)
+ rec.ir_free |= xfs_inobt_maskn(0,
+ agino + 1 - rec.ir_startino);
+
+ /*
+ * If there are allocated inodes in this chunk, find them
+ * and update the scan cursor.
+ */
+ allocmask = ~rec.ir_free;
+ if (hweight64(allocmask) > 0) {
+ int next = xfs_lowbit64(allocmask);
+
+ ASSERT(next >= 0);
+ *cursor = rec.ir_startino + next;
+ *allocmaskp = allocmask >> next;
+ *nr_inodesp = XFS_INODES_PER_CHUNK - next;
+ break;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+ struct xchk_iscan *iscan,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t cursor, visited;
+
+ BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+ /*
+ * Special-case ino == 0 here so that we never set visited_ino to
+ * NULLFSINO when wrapping around EOFS, for that will let through all
+ * live updates.
+ */
+ cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (cursor == 0)
+ visited = XFS_MAXINUMBER;
+ else
+ visited = cursor - 1;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = cursor;
+ iscan->__visited_ino = visited;
+ trace_xchk_iscan_move_cursor(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Prepare to return agno/agino to the iscan caller by moving the lastino
+ * cursor to the previous inode. Do this while we still hold the AGI so that
+ * no other threads can create or delete inodes in this AG.
+ */
+static inline void
+xchk_iscan_finish(
+ struct xchk_iscan *iscan)
+{
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = NULLFSINO;
+
+ /* All live updates will be applied from now on */
+ iscan->__visited_ino = NULLFSINO;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/* Mark an inode scan finished before we actually scan anything. */
+void
+xchk_iscan_finish_early(
+ struct xchk_iscan *iscan)
+{
+ ASSERT(iscan->cursor_ino == iscan->scan_start_ino);
+ ASSERT(iscan->__visited_ino == iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+}
+
+/*
+ * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set,
+ * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed,
+ * or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_read_agi(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned long relax;
+ int ret;
+
+ if (!xchk_iscan_agi_needs_trylock(iscan))
+ return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp);
+
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ do {
+ ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK,
+ agi_bpp);
+ if (ret != -EAGAIN)
+ return ret;
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ trace_xchk_iscan_agi_retry_wait(iscan);
+ } while (!schedule_timeout_killable(relax) &&
+ !xchk_iscan_aborted(iscan));
+ return -ECANCELED;
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+ struct xchk_iscan *iscan,
+ struct xfs_perag **pagp,
+ struct xfs_buf **agi_bpp,
+ xfs_inofree_t *allocmaskp,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ret;
+
+ ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+ do {
+ if (xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+
+ agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ECANCELED;
+
+ ret = xchk_iscan_read_agi(iscan, pag, &agi_bp);
+ if (ret)
+ goto out_pag;
+
+ agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+ ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+ &agino, nr_inodesp);
+ if (ret)
+ goto out_buf;
+
+ if (agino != NULLAGINO) {
+ /*
+ * Found the next inode in this AG, so return it along
+ * with the AGI buffer and the perag structure to
+ * ensure it cannot go away.
+ */
+ xchk_iscan_move_cursor(iscan, agno, agino);
+ *agi_bpp = agi_bp;
+ *pagp = pag;
+ return 1;
+ }
+
+ /*
+ * Did not find any more inodes in this AG, move on to the next
+ * AG.
+ */
+ agno = (agno + 1) % mp->m_sb.sb_agcount;
+ xchk_iscan_move_cursor(iscan, agno, 0);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ trace_xchk_iscan_advance_ag(iscan);
+ } while (iscan->cursor_ino != iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+ xfs_perag_put(pag);
+ return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again. Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+ struct xchk_iscan *iscan,
+ bool wait)
+{
+ ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ if (wait) {
+ unsigned long relax;
+
+ /*
+ * Sleep for a period of time to let the rest of the system
+ * catch up. If we return early, someone sent a kill signal to
+ * the calling process.
+ */
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ trace_xchk_iscan_iget_retry_wait(iscan);
+
+ if (schedule_timeout_killable(relax) ||
+ xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+ }
+
+ iscan->cursor_ino--;
+ return -EAGAIN;
+}
+
+/*
+ * For an inode scan, we hold the AGI and want to try to grab a batch of
+ * inodes. Holding the AGI prevents inodegc from clearing freed inodes,
+ * so we must use noretry here. For every inode after the first one in the
+ * batch, we don't want to wait, so we use retry there too. Finally, use
+ * dontcache to avoid polluting the cache.
+ */
+#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE)
+
+/*
+ * Grab an inode as part of an inode scan. While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_visit succeeds.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf *agi_bp,
+ xfs_inofree_t allocmask,
+ uint8_t nr_inodes)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t ino = iscan->cursor_ino;
+ unsigned int idx = 0;
+ unsigned int i;
+ int error;
+
+ ASSERT(iscan->__inodes[0] == NULL);
+
+ /* Fill the first slot in the inode array. */
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
+ &iscan->__inodes[idx]);
+
+ trace_xchk_iscan_iget(iscan, error);
+
+ if (error == -ENOENT || error == -EAGAIN) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * It's possible that this inode has lost all of its links but
+ * hasn't yet been inactivated. If we don't have a transaction
+ * or it's not writable, flush the inodegc workers and wait.
+ * If we have a non-empty transaction, we must not block on
+ * inodegc, which allocates its own transactions.
+ */
+ if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
+ xfs_inodegc_push(mp);
+ else
+ xfs_inodegc_flush(mp);
+ return xchk_iscan_iget_retry(iscan, true);
+ }
+
+ if (error == -EINVAL) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * We thought the inode was allocated, but the inode btree
+ * lookup failed, which means that it was freed since the last
+ * time we advanced the cursor. Back up and try again. This
+ * should never happen since still hold the AGI buffer from the
+ * inobt check, but we need to be careful about infinite loops.
+ */
+ return xchk_iscan_iget_retry(iscan, false);
+ }
+
+ if (error) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return error;
+ }
+ idx++;
+ ino++;
+ allocmask >>= 1;
+
+ /*
+ * Now that we've filled the first slot in __inodes, try to fill the
+ * rest of the batch with consecutively ordered inodes. to reduce the
+ * number of _iter calls. Make a bitmap of unallocated inodes from the
+ * zeroes in the inuse bitmap; these inodes will not be scanned, but
+ * the _want_live_update predicate will pass through all live updates.
+ *
+ * If we can't iget an allocated inode, stop and return what we have.
+ */
+ mutex_lock(&iscan->lock);
+ iscan->__batch_ino = ino - 1;
+ iscan->__skipped_inomask = 0;
+ mutex_unlock(&iscan->lock);
+
+ for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+ if (!(allocmask & 1)) {
+ ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ iscan->__skipped_inomask |= (1ULL << i);
+ mutex_unlock(&iscan->lock);
+ continue;
+ }
+
+ ASSERT(iscan->__inodes[idx] == NULL);
+
+ error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0,
+ &iscan->__inodes[idx]);
+ if (error)
+ break;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ mutex_unlock(&iscan->lock);
+ idx++;
+ }
+
+ trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t highest_skipped;
+
+ mutex_lock(&iscan->lock);
+
+ if (iscan->__batch_ino != NULLFSINO) {
+ highest_skipped = iscan->__batch_ino +
+ xfs_highbit64(iscan->__skipped_inomask);
+ iscan->__visited_ino = max(iscan->__visited_ino,
+ highest_skipped);
+
+ trace_xchk_iscan_skip(iscan);
+ }
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ int ret;
+
+ xchk_iscan_finish_batch(iscan);
+
+ if (iscan->iget_timeout)
+ iscan->__iget_deadline = jiffies +
+ msecs_to_jiffies(iscan->iget_timeout);
+
+ do {
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_inofree_t allocmask = 0;
+ uint8_t nr_inodes = 0;
+
+ ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+ &nr_inodes);
+ if (ret != 1)
+ return ret;
+
+ if (xchk_iscan_aborted(iscan)) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ ret = -ECANCELED;
+ break;
+ }
+
+ ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+ struct xchk_iscan *iscan,
+ struct xfs_inode **ipp)
+{
+ unsigned int i;
+ int error;
+
+ /* Find a cached inode, or go get another batch. */
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i])
+ goto foundit;
+ }
+
+ error = xchk_iscan_iter_batch(iscan);
+ if (error <= 0)
+ return error;
+
+ ASSERT(iscan->__inodes[0] != NULL);
+ i = 0;
+
+foundit:
+ /* Give the caller our reference. */
+ *ipp = iscan->__inodes[i];
+ iscan->__inodes[i] = NULL;
+ return 1;
+}
+
+/* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned int i;
+
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i]) {
+ xchk_irele(sc, iscan->__inodes[i]);
+ iscan->__inodes[i] = NULL;
+ }
+ }
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+ struct xchk_iscan *iscan)
+{
+ xchk_iscan_iter_finish(iscan);
+ xchk_iscan_finish(iscan);
+ mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+ struct xfs_mount *mp)
+{
+ static atomic_t agi_rotor;
+ unsigned int r = atomic_inc_return(&agi_rotor) - 1;
+
+ /*
+ * Rotoring *backwards* through the AGs, so we add one here before
+ * subtracting from the agcount to arrive at an AG number.
+ */
+ r = (r % mp->m_sb.sb_agcount) + 1;
+
+ return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
+
+/*
+ * Set ourselves up to start an inode scan. If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds. If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+ struct xfs_scrub *sc,
+ unsigned int iget_timeout,
+ unsigned int iget_retry_delay,
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t start_ino;
+
+ start_ino = xchk_iscan_rotor(sc->mp);
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ iscan->sc = sc;
+ clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+ iscan->iget_timeout = iget_timeout;
+ iscan->iget_retry_delay = iget_retry_delay;
+ iscan->__visited_ino = start_ino;
+ iscan->cursor_ino = start_ino;
+ iscan->scan_start_ino = start_ino;
+ mutex_init(&iscan->lock);
+ memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+ trace_xchk_iscan_start(iscan, start_ino);
+}
+
+/*
+ * Mark this inode as having been visited. Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+ struct xchk_iscan *iscan,
+ struct xfs_inode *ip)
+{
+ mutex_lock(&iscan->lock);
+ iscan->__visited_ino = ip->i_ino;
+ trace_xchk_iscan_visit(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned. All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+ const struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ if (iscan->__batch_ino == NULLFSINO)
+ return false;
+ if (ino < iscan->__batch_ino)
+ return false;
+ if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+ return false;
+
+ return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
+/*
+ * Do we need a live update for this inode? This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+ struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ bool ret = false;
+
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ mutex_lock(&iscan->lock);
+
+ trace_xchk_iscan_want_live_update(iscan, ino);
+
+ /* Scan is finished, caller should receive all updates. */
+ if (iscan->__visited_ino == NULLFSINO) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * No inodes have been visited yet, so the visited cursor points at the
+ * start of the scan range. The caller should not receive any updates.
+ */
+ if (iscan->scan_start_ino == iscan->__visited_ino) {
+ ret = false;
+ goto unlock;
+ }
+
+ /*
+ * This inode was not allocated at the time of the iscan batch.
+ * The caller should receive all updates.
+ */
+ if (xchk_iscan_skipped(iscan, ino)) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor hasn't yet wrapped around the end of the FS. If
+ * @ino is inside the starred range, the caller should receive updates:
+ *
+ * 0 ------------ S ************ V ------------ EOFS
+ */
+ if (iscan->scan_start_ino <= iscan->__visited_ino) {
+ if (ino >= iscan->scan_start_ino &&
+ ino <= iscan->__visited_ino)
+ ret = true;
+
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor wrapped around the end of the FS. If @ino is
+ * inside the starred range, the caller should receive updates:
+ *
+ * 0 ************ V ------------ S ************ EOFS
+ */
+ if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+ ret = true;
+
+unlock:
+ mutex_unlock(&iscan->lock);
+ return ret;
+}
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..f9f47fa01a9e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+ struct xfs_scrub *sc;
+
+ /* Lock to protect the scan cursor. */
+ struct mutex lock;
+
+ /*
+ * This is the first inode in the inumber address space that we
+ * examined. When the scan wraps around back to here, the scan is
+ * finished.
+ */
+ xfs_ino_t scan_start_ino;
+
+ /* This is the inode that will be examined next. */
+ xfs_ino_t cursor_ino;
+
+ /* If nonzero and non-NULL, skip this inode when scanning. */
+ xfs_ino_t skip_ino;
+
+ /*
+ * This is the last inode that we've successfully scanned, either
+ * because the caller scanned it, or we moved the cursor past an empty
+ * part of the inode address space. Scan callers should only use the
+ * xchk_iscan_visit function to modify this.
+ */
+ xfs_ino_t __visited_ino;
+
+ /* Operational state of the livescan. */
+ unsigned long __opstate;
+
+ /* Give up on iterating @cursor_ino if we can't iget it by this time. */
+ unsigned long __iget_deadline;
+
+ /* Amount of time (in ms) that we will try to iget an inode. */
+ unsigned int iget_timeout;
+
+ /* Wait this many ms to retry an iget. */
+ unsigned int iget_retry_delay;
+
+ /*
+ * The scan grabs batches of inodes and stashes them here before
+ * handing them out with _iter. Unallocated inodes are set in the
+ * mask so that all updates to that inode are selected for live
+ * update propagation.
+ */
+ xfs_ino_t __batch_ino;
+ xfs_inofree_t __skipped_inomask;
+ struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+
+/* Use trylock to acquire the AGI */
+#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2)
+
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline bool
+xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+ unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_finish_early(struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c
new file mode 100644
index 000000000000..256ff7700c94
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_trans.h"
+#include "scrub/scrub.h"
+#include "scrub/bitmap.h"
+#include "scrub/dab_bitmap.h"
+#include "scrub/listxattr.h"
+
+/* Call a function for every entry in a shortform xattr structure. */
+STATIC int
+xchk_xattr_walk_sf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int i;
+ int error;
+
+ sfe = xfs_attr_sf_firstentry(hdr);
+ for (i = 0; i < hdr->count; i++) {
+ error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen,
+ &sfe->nameval[sfe->namelen], sfe->valuelen,
+ priv);
+ if (error)
+ return error;
+
+ sfe = xfs_attr_sf_nextentry(sfe);
+ }
+
+ return 0;
+}
+
+/* Call a function for every entry in this xattr leaf block. */
+STATIC int
+xchk_xattr_walk_leaf_entries(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ struct xfs_buf *bp,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ unsigned int i;
+ int error;
+
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ void *value;
+ unsigned char *name;
+ unsigned int namelen, valuelen;
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ struct xfs_attr_leaf_name_local *name_loc;
+
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ name = name_loc->nameval;
+ namelen = name_loc->namelen;
+ value = &name_loc->nameval[name_loc->namelen];
+ valuelen = be16_to_cpu(name_loc->valuelen);
+ } else {
+ struct xfs_attr_leaf_name_remote *name_rmt;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ name = name_rmt->name;
+ namelen = name_rmt->namelen;
+ value = NULL;
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ }
+
+ error = attr_fn(sc, ip, entry->flags, name, namelen, value,
+ valuelen, priv);
+ if (error)
+ return error;
+
+ }
+
+ return 0;
+}
+
+/*
+ * Call a function for every entry in a leaf-format xattr structure. Avoid
+ * memory allocations for the loop detector since there's only one block.
+ */
+STATIC int
+xchk_xattr_walk_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ void *priv)
+{
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp);
+ if (error)
+ return error;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv);
+ xfs_trans_brelse(sc->tp, leaf_bp);
+ return error;
+}
+
+/* Find the leftmost leaf in the xattr dabtree. */
+STATIC int
+xchk_xattr_find_leftmost_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ struct xdab_bitmap *seen_dablks,
+ struct xfs_buf **leaf_bpp)
+{
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_buf *bp;
+ xfs_failaddr_t fa;
+ xfs_dablk_t blkno = 0;
+ unsigned int expected_level = 0;
+ int error;
+
+ for (;;) {
+ xfs_extlen_t len = 1;
+ uint16_t magic;
+
+ /* Make sure we haven't seen this new block already. */
+ if (xdab_bitmap_test(seen_dablks, blkno, &len))
+ return -EFSCORRUPTED;
+
+ error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+
+ error = -EFSCORRUPTED;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC)
+ goto out_buf;
+
+ fa = xfs_da3_node_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
+
+ if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_buf;
+
+ /* Check the level from the root node. */
+ if (blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_buf;
+ else
+ expected_level--;
+
+ /* Remember that we've seen this node. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ /* Find the next level towards the leaves of the dabtree. */
+ btree = nodehdr.btree;
+ blkno = be32_to_cpu(btree->before);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ error = -EFSCORRUPTED;
+ fa = xfs_attr3_leaf_header_check(bp, ip->i_ino);
+ if (fa)
+ goto out_buf;
+
+ if (expected_level != 0)
+ goto out_buf;
+
+ /* Remember that we've seen this leaf. */
+ error = xdab_bitmap_set(seen_dablks, blkno, 1);
+ if (error)
+ goto out_buf;
+
+ *leaf_bpp = bp;
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(tp, bp);
+ return error;
+}
+
+/* Call a function for every entry in a node-format xattr structure. */
+STATIC int
+xchk_xattr_walk_node(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xdab_bitmap seen_dablks;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_buf *leaf_bp;
+ int error;
+
+ xdab_bitmap_init(&seen_dablks);
+
+ error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ for (;;) {
+ xfs_extlen_t len;
+
+ error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp,
+ priv);
+ if (error)
+ goto out_leaf;
+
+ /* Find the right sibling of this leaf block. */
+ leaf = leaf_bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ if (leafhdr.forw == 0)
+ goto out_leaf;
+
+ xfs_trans_brelse(sc->tp, leaf_bp);
+
+ if (leaf_fn) {
+ error = leaf_fn(sc, priv);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Make sure we haven't seen this new leaf already. */
+ len = 1;
+ if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) {
+ error = -EFSCORRUPTED;
+ goto out_bitmap;
+ }
+
+ error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino,
+ leafhdr.forw, &leaf_bp);
+ if (error)
+ goto out_bitmap;
+
+ /* Remember that we've seen this new leaf. */
+ error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1);
+ if (error)
+ goto out_leaf;
+ }
+
+out_leaf:
+ xfs_trans_brelse(sc->tp, leaf_bp);
+out_bitmap:
+ xdab_bitmap_destroy(&seen_dablks);
+ return error;
+}
+
+/*
+ * Call a function for every extended attribute in a file.
+ *
+ * Callers must hold the ILOCK. No validation or cursor restarts allowed.
+ * Returns -EFSCORRUPTED on any problem, including loops in the dabtree.
+ */
+int
+xchk_xattr_walk(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn,
+ xchk_xattrleaf_fn leaf_fn,
+ void *priv)
+{
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ if (!xfs_inode_hasattr(ip))
+ return 0;
+
+ if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xchk_xattr_walk_sf(sc, ip, attr_fn, priv);
+
+ /* attr functions require that the attr fork is loaded */
+ error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ if (xfs_attr_is_leaf(ip))
+ return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv);
+
+ return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv);
+}
diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h
new file mode 100644
index 000000000000..703cfb7b14cf
--- /dev/null
+++ b/fs/xfs/scrub/listxattr.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_LISTXATTR_H__
+#define __XFS_SCRUB_LISTXATTR_H__
+
+typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int attr_flags, const unsigned char *name,
+ unsigned int namelen, const void *value, unsigned int valuelen,
+ void *priv);
+
+typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv);
+
+int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip,
+ xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv);
+
+#endif /* __XFS_SCRUB_LISTXATTR_H__ */
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
new file mode 100644
index 000000000000..e21c16fbd15d
--- /dev/null
+++ b/fs/xfs/scrub/metapath.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_metafile.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_attr.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+#include "scrub/repair.h"
+
+/*
+ * Metadata Directory Tree Paths
+ * =============================
+ *
+ * A filesystem with metadir enabled expects to find metadata structures
+ * attached to files that are accessible by walking a path down the metadata
+ * directory tree. Given the metadir path and the incore inode storing the
+ * metadata, this scrubber ensures that the ondisk metadir path points to the
+ * ondisk inode represented by the incore inode.
+ */
+
+struct xchk_metapath {
+ struct xfs_scrub *sc;
+
+ /* Name for lookup */
+ struct xfs_name xname;
+
+ /* Directory update for repairs */
+ struct xfs_dir_update du;
+
+ /* Path down to this metadata file from the parent directory */
+ const char *path;
+
+ /* Directory parent of the metadata file. */
+ struct xfs_inode *dp;
+
+ /* Locks held on dp */
+ unsigned int dp_ilock_flags;
+
+ /* Transaction block reservations */
+ unsigned int link_resblks;
+ unsigned int unlink_resblks;
+
+ /* Parent pointer updates */
+ struct xfs_parent_args link_ppargs;
+ struct xfs_parent_args unlink_ppargs;
+
+ /* Scratchpads for removing links */
+ struct xfs_da_args pptr_args;
+};
+
+/* Release resources tracked in the buffer. */
+static inline void
+xchk_metapath_cleanup(
+ void *buf)
+{
+ struct xchk_metapath *mpath = buf;
+
+ if (mpath->dp_ilock_flags)
+ xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
+ kfree(mpath->path);
+}
+
+/* Set up a metadir path scan. @path must be dynamically allocated. */
+static inline int
+xchk_setup_metapath_scan(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ const char *path,
+ struct xfs_inode *ip)
+{
+ struct xchk_metapath *mpath;
+ int error;
+
+ if (!path)
+ return -ENOMEM;
+
+ error = xchk_install_live_inode(sc, ip);
+ if (error) {
+ kfree(path);
+ return error;
+ }
+
+ mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
+ if (!mpath) {
+ kfree(path);
+ return -ENOMEM;
+ }
+
+ mpath->sc = sc;
+ sc->buf = mpath;
+ sc->buf_cleanup = xchk_metapath_cleanup;
+
+ mpath->dp = dp;
+ mpath->path = path; /* path is now owned by mpath */
+
+ mpath->xname.name = mpath->path;
+ mpath->xname.len = strlen(mpath->path);
+ mpath->xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+
+ return 0;
+}
+
+#ifdef CONFIG_XFS_RT
+/* Scan the /rtgroups directory itself. */
+static int
+xchk_setup_metapath_rtdir(
+ struct xfs_scrub *sc)
+{
+ if (!sc->mp->m_rtdirip)
+ return -ENOENT;
+
+ return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
+ kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+}
+
+/* Scan a rtgroup inode under the /rtgroups directory. */
+static int
+xchk_setup_metapath_rtginode(
+ struct xfs_scrub *sc,
+ enum xfs_rtg_inodes type)
+{
+ struct xfs_rtgroup *rtg;
+ struct xfs_inode *ip;
+ int error;
+
+ rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
+ if (!rtg)
+ return -ENOENT;
+
+ ip = rtg->rtg_inodes[type];
+ if (!ip) {
+ error = -ENOENT;
+ goto out_put_rtg;
+ }
+
+ error = xchk_setup_metapath_scan(sc, sc->mp->m_rtdirip,
+ xfs_rtginode_path(rtg_rgno(rtg), type), ip);
+
+out_put_rtg:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+#else
+# define xchk_setup_metapath_rtdir(...) (-ENOENT)
+# define xchk_setup_metapath_rtginode(...) (-ENOENT)
+#endif /* CONFIG_XFS_RT */
+
+#ifdef CONFIG_XFS_QUOTA
+/* Scan the /quota directory itself. */
+static int
+xchk_setup_metapath_quotadir(
+ struct xfs_scrub *sc)
+{
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+
+ if (!qi || !qi->qi_dirip)
+ return -ENOENT;
+
+ return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
+ kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
+}
+
+/* Scan a quota inode under the /quota directory. */
+static int
+xchk_setup_metapath_dqinode(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t type)
+{
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_inode *ip = NULL;
+
+ if (!qi)
+ return -ENOENT;
+
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ ip = qi->qi_uquotaip;
+ break;
+ case XFS_DQTYPE_GROUP:
+ ip = qi->qi_gquotaip;
+ break;
+ case XFS_DQTYPE_PROJ:
+ ip = qi->qi_pquotaip;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+ if (!ip)
+ return -ENOENT;
+
+ return xchk_setup_metapath_scan(sc, qi->qi_dirip,
+ kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
+}
+#else
+# define xchk_setup_metapath_quotadir(...) (-ENOENT)
+# define xchk_setup_metapath_dqinode(...) (-ENOENT)
+#endif /* CONFIG_XFS_QUOTA */
+
+int
+xchk_setup_metapath(
+ struct xfs_scrub *sc)
+{
+ if (!xfs_has_metadir(sc->mp))
+ return -ENOENT;
+ if (sc->sm->sm_gen)
+ return -EINVAL;
+
+ switch (sc->sm->sm_ino) {
+ case XFS_SCRUB_METAPATH_PROBE:
+ /* Just probing, nothing else to do. */
+ if (sc->sm->sm_agno)
+ return -EINVAL;
+ return 0;
+ case XFS_SCRUB_METAPATH_RTDIR:
+ return xchk_setup_metapath_rtdir(sc);
+ case XFS_SCRUB_METAPATH_RTBITMAP:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_BITMAP);
+ case XFS_SCRUB_METAPATH_RTSUMMARY:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_SUMMARY);
+ case XFS_SCRUB_METAPATH_QUOTADIR:
+ return xchk_setup_metapath_quotadir(sc);
+ case XFS_SCRUB_METAPATH_USRQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_USER);
+ case XFS_SCRUB_METAPATH_GRPQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP);
+ case XFS_SCRUB_METAPATH_PRJQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ);
+ case XFS_SCRUB_METAPATH_RTRMAPBT:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP);
+ case XFS_SCRUB_METAPATH_RTREFCOUNTBT:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and child. We do not know
+ * that the metadata directory is not corrupt, so we lock the parent and try
+ * to lock the child. Returns 0 if successful, or -EINTR to abort the scrub.
+ */
+STATIC int
+xchk_metapath_ilock_both(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ int error = 0;
+
+ while (true) {
+ xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) {
+ mpath->dp_ilock_flags |= XFS_ILOCK_EXCL;
+ return 0;
+ }
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
+ }
+
+ ASSERT(0);
+ return -EINTR;
+}
+
+/* Unlock parent and child inodes. */
+static inline void
+xchk_metapath_iunlock(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL;
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+}
+
+int
+xchk_metapath(
+ struct xfs_scrub *sc)
+{
+ struct xchk_metapath *mpath = sc->buf;
+ xfs_ino_t ino = NULLFSINO;
+ int error;
+
+ /* Just probing, nothing else to do. */
+ if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE)
+ return 0;
+
+ /* Parent required to do anything else. */
+ if (mpath->dp == NULL) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ error = xchk_metapath_ilock_both(mpath);
+ if (error)
+ goto out_cancel;
+
+ /* Make sure the parent dir has a dirent pointing to this file. */
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /* No directory entry at all */
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ error = 0;
+ goto out_ilock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_ilock;
+ if (ino != sc->ip->i_ino) {
+ /* Pointing to wrong inode */
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+out_ilock:
+ xchk_metapath_iunlock(mpath);
+out_cancel:
+ xchk_trans_cancel(sc);
+ return error;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Create the dirent represented by the final component of the path. */
+STATIC int
+xrep_metapath_link(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+
+ mpath->du.dp = mpath->dp;
+ mpath->du.name = &mpath->xname;
+ mpath->du.ip = sc->ip;
+
+ if (xfs_has_parent(sc->mp))
+ mpath->du.ppargs = &mpath->link_ppargs;
+ else
+ mpath->du.ppargs = NULL;
+
+ trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino);
+
+ return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du);
+}
+
+/* Remove the dirent at the final component of the path. */
+STATIC int
+xrep_metapath_unlink(
+ struct xchk_metapath *mpath,
+ xfs_ino_t ino,
+ struct xfs_inode *ip)
+{
+ struct xfs_parent_rec rec;
+ struct xfs_scrub *sc = mpath->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino);
+
+ if (!ip) {
+ /* The child inode isn't allocated. Junk the dirent. */
+ xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE);
+ return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname,
+ ino, mpath->unlink_resblks);
+ }
+
+ mpath->du.dp = mpath->dp;
+ mpath->du.name = &mpath->xname;
+ mpath->du.ip = ip;
+ mpath->du.ppargs = NULL;
+
+ /* Figure out if we're removing a parent pointer too. */
+ if (xfs_has_parent(mp)) {
+ xfs_inode_to_parent_rec(&rec, ip);
+ error = xfs_parent_lookup(sc->tp, ip, &mpath->xname, &rec,
+ &mpath->pptr_args);
+ switch (error) {
+ case -ENOATTR:
+ break;
+ case 0:
+ mpath->du.ppargs = &mpath->unlink_ppargs;
+ break;
+ default:
+ return error;
+ }
+ }
+
+ return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du);
+}
+
+/*
+ * Try to create a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @sc->ip. Returns:
+ *
+ * -EEXIST and an @alleged_child if the dirent that points to the wrong inode;
+ * 0 if there is now a dirent pointing to @sc->ip; or
+ * A negative errno on error.
+ */
+STATIC int
+xrep_metapath_try_link(
+ struct xchk_metapath *mpath,
+ xfs_ino_t *alleged_child)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ xfs_ino_t ino;
+ int error;
+
+ /* Allocate transaction, lock inodes, join to transaction. */
+ error = xchk_trans_alloc(sc, mpath->link_resblks);
+ if (error)
+ return error;
+
+ error = xchk_metapath_ilock_both(mpath);
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+ xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /*
+ * There is no dirent in the directory. Create an entry
+ * pointing to @sc->ip.
+ */
+ error = xrep_metapath_link(mpath);
+ if (error)
+ goto out_cancel;
+
+ error = xrep_trans_commit(sc);
+ xchk_metapath_iunlock(mpath);
+ return error;
+ }
+ if (error)
+ goto out_cancel;
+
+ if (ino == sc->ip->i_ino) {
+ /* The dirent already points to @sc->ip; we're done. */
+ error = 0;
+ goto out_cancel;
+ }
+
+ /*
+ * The dirent points elsewhere; pass that back so that the caller
+ * can try to remove the dirent.
+ */
+ *alleged_child = ino;
+ error = -EEXIST;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+ xchk_metapath_iunlock(mpath);
+ return error;
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and a bad child, if one is
+ * supplied. We do not know that the metadata directory is not corrupt, so we
+ * lock the parent and try to lock the child. Returns 0 if successful, or
+ * -EINTR to abort the repair. The lock state of @dp is not recorded in @mpath.
+ */
+STATIC int
+xchk_metapath_ilock_parent_and_child(
+ struct xchk_metapath *mpath,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ int error = 0;
+
+ while (true) {
+ xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+ if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return 0;
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
+ }
+
+ ASSERT(0);
+ return -EINTR;
+}
+
+/*
+ * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @alleged_child. Returns:
+ *
+ * 0 if there is no longer a dirent;
+ * -EEXIST if the dirent points to @sc->ip;
+ * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or
+ * A negative errno for any other error.
+ */
+STATIC int
+xrep_metapath_try_unlink(
+ struct xchk_metapath *mpath,
+ xfs_ino_t *alleged_child)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ struct xfs_inode *ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(*alleged_child != sc->ip->i_ino);
+
+ trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp,
+ *alleged_child);
+
+ /*
+ * Allocate transaction, grab the alleged child inode, lock inodes,
+ * join to transaction.
+ */
+ error = xchk_trans_alloc(sc, mpath->unlink_resblks);
+ if (error)
+ return error;
+
+ error = xchk_iget(sc, *alleged_child, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ /* inode number is bogus, junk the dirent */
+ error = 0;
+ }
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+
+ error = xchk_metapath_ilock_parent_and_child(mpath, ip);
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+ xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+ if (ip)
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /*
+ * There is no dirent in the directory anymore. We're ready to
+ * try the link operation again.
+ */
+ error = 0;
+ goto out_cancel;
+ }
+ if (error)
+ goto out_cancel;
+
+ if (ino == sc->ip->i_ino) {
+ /* The dirent already points to @sc->ip; we're done. */
+ error = -EEXIST;
+ goto out_cancel;
+ }
+
+ /*
+ * The dirent does not point to the alleged child. Update the caller
+ * and signal that we want to be called again.
+ */
+ if (ino != *alleged_child) {
+ *alleged_child = ino;
+ error = -EAGAIN;
+ goto out_cancel;
+ }
+
+ /* Remove the link to the child. */
+ error = xrep_metapath_unlink(mpath, ino, ip);
+ if (error)
+ goto out_cancel;
+
+ error = xrep_trans_commit(sc);
+ goto out_unlock;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_unlock:
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xchk_irele(sc, ip);
+ }
+ return error;
+}
+
+/*
+ * Make sure the metadata directory path points to the child being examined.
+ *
+ * Repair needs to be able to create a directory structure, create its own
+ * transactions, and take ILOCKs. This function /must/ be called after all
+ * other repairs have completed.
+ */
+int
+xrep_metapath(
+ struct xfs_scrub *sc)
+{
+ struct xchk_metapath *mpath = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /* Just probing, nothing to repair. */
+ if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE)
+ return 0;
+
+ /* Parent required to do anything else. */
+ if (mpath->dp == NULL)
+ return -EFSCORRUPTED;
+
+ /*
+ * Make sure the child file actually has an attr fork to receive a new
+ * parent pointer if the fs has parent pointers.
+ */
+ if (xfs_has_parent(mp)) {
+ error = xfs_attr_add_fork(sc->ip,
+ sizeof(struct xfs_attr_sf_hdr), 1);
+ if (error)
+ return error;
+ }
+
+ /* Compute block reservation required to unlink and link a file. */
+ mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN);
+ mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN);
+
+ do {
+ xfs_ino_t alleged_child;
+
+ /* Re-establish the link, or tell us which inode to remove. */
+ error = xrep_metapath_try_link(mpath, &alleged_child);
+ if (!error)
+ return 0;
+ if (error != -EEXIST)
+ return error;
+
+ /*
+ * Remove an incorrect link to an alleged child, or tell us
+ * which inode to remove.
+ */
+ do {
+ error = xrep_metapath_try_unlink(mpath, &alleged_child);
+ } while (error == -EAGAIN);
+ if (error == -EEXIST) {
+ /* Link established; we're done. */
+ error = 0;
+ break;
+ }
+ } while (!error);
+
+ return error;
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index bb6d980b4fcd..1588ce971cb8 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -19,6 +19,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
+#include "xfs_metafile.h"
+#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -58,9 +60,9 @@ xrep_newbt_estimate_slack(
if (sc->ops->type == ST_PERAG) {
free = sc->sa.pag->pagf_freeblks;
- sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
- free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}
@@ -121,6 +123,43 @@ xrep_newbt_init_inode(
}
/*
+ * Initialize accounting resources for staging a new metadata inode btree.
+ * If the metadata file has a space reservation, the caller must adjust that
+ * reservation when committing the new ondisk btree.
+ */
+int
+xrep_newbt_init_metadir_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_ifork *ifp;
+
+ ASSERT(xfs_is_metadir_inode(sc->ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ /*
+ * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
+ * inode metadata space reservations can only account allocated space
+ * to the i_nblocks. We do not want to change the inode core fields
+ * until we're ready to commit the new tree, so we allocate the blocks
+ * as if they were regular file blocks. This exposes us to a higher
+ * risk of the repair being cancelled due to ENOSPC.
+ */
+ xrep_newbt_init_ag(xnr, sc, &oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
+ return 0;
+}
+
+/*
* Initialize accounting resources for staging a new btree. Callers are
* expected to add their own reservations (and clean them up) manually.
*/
@@ -160,7 +199,8 @@ xrep_newbt_add_blocks(
if (args->tp) {
ASSERT(xnr->oinfo.oi_offset == 0);
- error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+ error = xfs_alloc_schedule_autoreap(args,
+ XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
if (error)
goto out_pag;
}
@@ -185,11 +225,10 @@ xrep_newbt_add_extent(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- struct xfs_mount *mp = xnr->sc->mp;
struct xfs_alloc_arg args = {
.tp = NULL, /* no autoreap */
.oinfo = xnr->oinfo,
- .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
+ .fsbno = xfs_agbno_to_fsb(pag, agbno),
.len = len,
.resv = xnr->resv,
};
@@ -205,12 +244,12 @@ xrep_newbt_validate_ag_alloc_hint(
struct xfs_scrub *sc = xnr->sc;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
- if (agno == sc->sa.pag->pag_agno &&
+ if (agno == pag_agno(sc->sa.pag) &&
xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
return;
- xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_AGFL_BLOCK(sc->mp) + 1);
+ xnr->alloc_hint =
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}
/* Allocate disk space for a new per-AG btree. */
@@ -224,6 +263,7 @@ xrep_newbt_alloc_ag_blocks(
int error = 0;
ASSERT(sc->sa.pag != NULL);
+ ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
while (nr_blocks > 0) {
struct xfs_alloc_arg args = {
@@ -239,23 +279,26 @@ xrep_newbt_alloc_ag_blocks(
xrep_newbt_validate_ag_alloc_hint(xnr);
- error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_near_bno(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
return -ENOSPC;
agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+ if (agno != pag_agno(sc->sa.pag)) {
+ ASSERT(agno == pag_agno(sc->sa.pag));
+ return -EFSCORRUPTED;
+ }
- trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
xnr->oinfo.oi_owner);
- if (agno != sc->sa.pag->pag_agno) {
- ASSERT(agno == sc->sa.pag->pag_agno);
- return -EFSCORRUPTED;
- }
-
error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
if (error)
return error;
@@ -294,6 +337,8 @@ xrep_newbt_alloc_file_blocks(
struct xfs_mount *mp = sc->mp;
int error = 0;
+ ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);
+
while (nr_blocks > 0) {
struct xfs_alloc_arg args = {
.tp = sc->tp,
@@ -309,7 +354,11 @@ xrep_newbt_alloc_file_blocks(
xrep_newbt_validate_file_alloc_hint(xnr);
- error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_start_ag(&args,
+ xnr->alloc_hint);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
@@ -317,16 +366,16 @@ xrep_newbt_alloc_file_blocks(
agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
- trace_xrep_newbt_alloc_file_blocks(mp, agno,
- XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
- xnr->oinfo.oi_owner);
-
pag = xfs_perag_get(mp, agno);
if (!pag) {
ASSERT(0);
return -EFSCORRUPTED;
}
+ trace_xrep_newbt_alloc_file_blocks(pag,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
error = xrep_newbt_add_blocks(xnr, pag, &args);
xfs_perag_put(pag);
if (error)
@@ -367,7 +416,6 @@ xrep_newbt_free_extent(
struct xfs_scrub *sc = xnr->sc;
xfs_agblock_t free_agbno = resv->agbno;
xfs_extlen_t free_aglen = resv->len;
- xfs_fsblock_t fsbno;
int error;
if (!btree_committed || resv->used == 0) {
@@ -376,8 +424,8 @@ xrep_newbt_free_extent(
* space reservation, let the existing EFI free the entire
* space extent.
*/
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
- free_agbno, free_aglen, xnr->oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ xnr->oinfo.oi_owner);
xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
return 1;
}
@@ -394,8 +442,8 @@ xrep_newbt_free_extent(
if (free_aglen == 0)
return 0;
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
- free_aglen, xnr->oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ xnr->oinfo.oi_owner);
ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
@@ -404,9 +452,9 @@ xrep_newbt_free_extent(
* Use EFIs to free the reservations. This reduces the chance
* that we leak blocks if the system goes down.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
- error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
- xnr->resv, true);
+ error = xfs_free_extent_later(sc->tp,
+ xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
+ &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -507,7 +555,6 @@ xrep_newbt_claim_block(
union xfs_btree_ptr *ptr)
{
struct xrep_newbt_resv *resv;
- struct xfs_mount *mp = cur->bc_mp;
xfs_agblock_t agbno;
/*
@@ -532,12 +579,10 @@ xrep_newbt_claim_block(
if (resv->used == resv->len)
list_move_tail(&resv->list, &xnr->resv_list);
- trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
- xnr->oinfo.oi_owner);
+ trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
- agbno));
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+ ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
else
ptr->s = cpu_to_be32(agbno);
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 89f8e3970b1f..5ce785599287 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -6,6 +6,8 @@
#ifndef __XFS_SCRUB_NEWBT_H__
#define __XFS_SCRUB_NEWBT_H__
+struct xfs_alloc_arg;
+
struct xrep_newbt_resv {
/* Link to list of extents that we've reserved. */
struct list_head list;
@@ -28,6 +30,11 @@ struct xrep_newbt_resv {
struct xrep_newbt {
struct xfs_scrub *sc;
+ /* Custom allocation function, or NULL for xfs_alloc_vextent */
+ int (*alloc_vextent)(struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint);
+
/* List of extents that we've reserved. */
struct list_head resv_list;
@@ -56,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
enum xfs_ag_resv_type resv);
int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc);
int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
xfs_agblock_t agbno, xfs_extlen_t len);
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..4a47d0aabf73
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ag.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Live Inode Link Count Checking
+ * ==============================
+ *
+ * Inode link counts are "summary" metadata, in the sense that they are
+ * computed as the number of directory entries referencing each file on the
+ * filesystem. Therefore, we compute the correct link counts by creating a
+ * shadow link count structure and walking every inode.
+ */
+
+/* Set us up to scrub inode link counts. */
+int
+xchk_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_nlinks(sc);
+ if (error)
+ return error;
+ }
+
+ xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+ if (!xnc)
+ return -ENOMEM;
+ xnc->xname.name = xnc->namebuf;
+ xnc->sc = sc;
+ sc->buf = xnc;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting file link counts. For each file, we create a shadow link
+ * counting structure, then walk the entire directory tree, incrementing parent
+ * and child link counts for each directory entry seen.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect. Therefore, we hook all
+ * directory entry updates because that is when link count updates occur. By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date. Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running. Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
+ */
+
+/*
+ * Add a delta to an nlink counter, clamping the value to U32_MAX. Because
+ * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
+ * even if we lose some precision.
+ */
+static inline void
+careful_add(
+ xfs_nlink_t *nlinkp,
+ int delta)
+{
+ uint64_t new_value = (uint64_t)(*nlinkp) + delta;
+
+ BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
+ *nlinkp = min_t(uint64_t, new_value, U32_MAX);
+}
+
+/* Update incore link count information. Caller must hold the nlinks lock. */
+STATIC int
+xchk_nlinks_update_incore(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ int parents_delta,
+ int backrefs_delta,
+ int children_delta)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ if (!xnc->nlinks)
+ return 0;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
+ backrefs_delta, children_delta);
+
+ careful_add(&nl.parents, parents_delta);
+ careful_add(&nl.backrefs, backrefs_delta);
+ careful_add(&nl.children, children_delta);
+
+ nl.flags |= XCHK_NLINK_WRITTEN;
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
+
+/*
+ * Apply a link count change from the regular filesystem into our shadow link
+ * count structure based on a directory update in progress.
+ */
+STATIC int
+xchk_nlinks_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
+ xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(p->dp))
+ return NOTIFY_DONE;
+
+ trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+ p->delta, p->name->name, p->name->len);
+
+ /*
+ * If we've already scanned @dp, update the number of parents that link
+ * to @ip. If @ip is a subdirectory, update the number of child links
+ * going out of @dp.
+ */
+ if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+ 0, 0);
+ if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ 0, p->delta);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * If @ip is a subdirectory and we've already scanned it, update the
+ * number of backrefs pointing to @dp.
+ */
+ if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
+ xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ p->delta, 0);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+ return NOTIFY_DONE;
+}
+
+/* Bump the observed link count for the inode referenced by this entry. */
+STATIC int
+xchk_nlinks_collect_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+ bool dot = false, dotdot = false;
+ int error;
+
+ /* Does this name make sense? */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ if (name->len == 1 && name->name[0] == '.')
+ dot = true;
+ else if (name->len == 2 && name->name[0] == '.' &&
+ name->name[1] == '.')
+ dotdot = true;
+
+ /* Don't accept a '.' entry that points somewhere else. */
+ if (dot && ino != dp->i_ino) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Don't accept an invalid inode number. */
+ if (!xfs_verify_dir_ino(sc->mp, ino)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
+
+ mutex_lock(&xnc->lock);
+
+ /*
+ * If this is a dotdot entry, it is a back link from dp to ino. How
+ * we handle this depends on whether or not dp is the root directory.
+ *
+ * The root directory is its own parent, so we pretend the dotdot entry
+ * establishes the "parent" of the root directory. Increment the
+ * number of parents of the root directory.
+ *
+ * Otherwise, increment the number of backrefs pointing back to ino.
+ *
+ * If the filesystem has parent pointers, we walk the pptrs to
+ * determine the backref count.
+ */
+ if (dotdot) {
+ if (xchk_inode_is_dirtree_root(dp))
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ else if (!xfs_has_parent(sc->mp))
+ error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ else
+ error = 0;
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link from dp to ino, increment the
+ * number of parents linking into ino.
+ */
+ if (!dot && !dotdot) {
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link to a subdirectory, increment the
+ * number of child links of dp.
+ */
+ if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
+ error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
+ if (error)
+ goto out_unlock;
+ }
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
+
+/* Bump the backref count for the inode referenced by this parent pointer. */
+STATIC int
+xchk_nlinks_collect_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_nlink_ctrs *xnc = priv;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ int error;
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec);
+
+ mutex_lock(&xnc->lock);
+
+ error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
+
+/* Walk a directory to bump the observed link counts of the children. */
+STATIC int
+xchk_nlinks_collect_dir(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
+ /* Prevent anyone from changing this directory while we walk it. */
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * The dotdot entry of an unlinked directory still points to the last
+ * parent, but the parent no longer links to this directory. Skip the
+ * directory to avoid overcounting.
+ */
+ if (VFS_I(dp)->i_nlink == 0)
+ goto out_unlock;
+
+ /*
+ * We cannot count file links if the directory looks as though it has
+ * been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_abort;
+ }
+
+ error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+
+ /* Walk the parent pointers to get real backref counts. */
+ if (xfs_has_parent(sc->mp)) {
+ /*
+ * If the extended attributes look as though they has been
+ * zapped by the inode record repair code, we cannot scan for
+ * parent pointers.
+ */
+ if (xchk_pptr_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL,
+ xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+ }
+
+ xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
+ goto out_unlock;
+
+out_abort:
+ xchk_set_incomplete(sc);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* If this looks like a valid pointer, count it. */
+static inline int
+xchk_nlinks_collect_metafile(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ if (!xfs_verify_ino(xnc->sc->mp, ino))
+ return 0;
+
+ trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
+ return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+}
+
+/* Bump the link counts of metadata files rooted in the superblock. */
+STATIC int
+xchk_nlinks_collect_metafiles(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = xnc->sc->mp;
+ int error = -ECANCELED;
+
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ goto out_incomplete;
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
+ if (error)
+ goto out_abort;
+ mutex_unlock(&xnc->lock);
+
+ return 0;
+
+out_abort:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(xnc->sc);
+ return error;
+}
+
+/* Advance the collection scan cursor for this non-directory file. */
+static inline int
+xchk_nlinks_collect_file(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+}
+
+/* Walk all directories and count inode links. */
+STATIC int
+xchk_nlinks_collect(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Count the rt and quota files that are rooted in the superblock. */
+ error = xchk_nlinks_collect_metafiles(xnc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
+ if (S_ISDIR(VFS_I(ip)->i_mode))
+ error = xchk_nlinks_collect_dir(xnc, ip);
+ else
+ error = xchk_nlinks_collect_file(xnc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->collect_iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Switch out for a real transaction in preparation for building a new
+ * tree.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing file link counters. Walk each inode and compare the link
+ * counts against our shadow information; and then walk each shadow link count
+ * structure (that wasn't covered in the first part), comparing it against the
+ * file.
+ */
+
+/* Read the observed link count for comparison with the actual inode. */
+STATIC int
+xchk_nlinks_comparison_read(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ struct xchk_nlink *obs)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
+
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * shouldn't really happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+ if (error)
+ return error;
+
+ /* Copy the counters, but do not expose the internal state. */
+ obs->parents = nl.parents;
+ obs->backrefs = nl.backrefs;
+ obs->children = nl.children;
+ obs->flags = 0;
+ return 0;
+}
+
+/* Check our link count against an inode. */
+STATIC int
+xchk_nlinks_compare_inode(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ uint64_t total_links;
+ unsigned int actual_nlink;
+ int error;
+
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * If we don't have ftype to get an accurate count of the subdirectory
+ * entries in this directory, take advantage of the fact that on a
+ * consistent ftype=0 filesystem, the number of subdirectory
+ * backreferences (dotdot entries) pointing towards this directory
+ * should be equal to the number of subdirectory entries in the
+ * directory.
+ */
+ if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
+ obs.children = obs.backrefs;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
+
+ /*
+ * If we found so many parents that we'd overflow i_nlink, we must flag
+ * this as a corruption. The VFS won't let users increase the link
+ * count, but it will let them decrease it.
+ */
+ if (total_links > XFS_NLINK_PINNED) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ } else if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_warning(sc, ip->i_ino);
+ }
+
+ /* Link counts should match. */
+ if (total_links != actual_nlink) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
+ /*
+ * The collection phase ignores directories with zero link
+ * count, so we ignore them here too.
+ *
+ * The number of subdirectory backreferences (dotdot entries)
+ * pointing towards this directory should be equal to the
+ * number of subdirectory entries in the directory.
+ */
+ if (obs.children != obs.backrefs)
+ xchk_ino_xref_set_corrupt(sc, ip->i_ino);
+ } else {
+ /*
+ * Non-directories and unlinked directories should not have
+ * back references.
+ */
+ if (obs.backrefs != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /*
+ * Non-directories and unlinked directories should not have
+ * children.
+ */
+ if (obs.children != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+ if (xchk_inode_is_dirtree_root(ip)) {
+ /*
+ * For the root of a directory tree, both the '.' and '..'
+ * entries should point to the root directory. The dotdot
+ * entry is counted as a parent of the root /and/ a backref of
+ * the root directory.
+ */
+ if (obs.parents != 1) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ } else if (actual_nlink > 0) {
+ /*
+ * Linked files that are not the root directory should have at
+ * least one parent.
+ */
+ if (obs.parents == 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+out_corrupt:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ error = -ECANCELED;
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Check our link count against an inode that wasn't checked previously. This
+ * is intended to catch directories with dangling links, though we could be
+ * racing with inode allocation in other threads.
+ */
+STATIC int
+xchk_nlinks_compare_inum(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ struct xchk_nlink obs;
+ struct xfs_mount *mp = xnc->sc->mp;
+ struct xfs_trans *tp = xnc->sc->tp;
+ struct xfs_buf *agi_bp;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * The first iget failed, so try again with the variant that returns
+ * either an incore inode or the AGI buffer. If the function returns
+ * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
+ * can guarantee that the inode won't be allocated while we check for
+ * a zero link count in the observed link count data.
+ */
+ error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
+ if (!error) {
+ /* Actually got an inode, so use the inode compare. */
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_irele(xnc->sc, ip);
+ return error;
+ }
+ if (error == -ENOENT || error == -EINVAL) {
+ /* No inode was found. Check for zero link count below. */
+ error = 0;
+ }
+ if (error)
+ goto out_agi;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_agi;
+ }
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_comparison_read(xnc, ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ trace_xchk_nlinks_check_zero(mp, ino, &obs);
+
+ /*
+ * If we can't grab the inode, the link count had better be zero. We
+ * still hold the AGI to prevent inode allocation/freeing.
+ */
+ if (xchk_nlink_total(NULL, &obs) != 0) {
+ xchk_ino_set_corrupt(xnc->sc, ino);
+ error = -ECANCELED;
+ }
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_agi:
+ if (agi_bp)
+ xfs_trans_brelse(tp, agi_bp);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem to compare the link count. Move
+ * on if we can't grab an inode, since we'll revisit unchecked nlink records in
+ * the second part.
+ */
+static int
+xchk_nlinks_compare_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Compare the link counts we observed against the live information. */
+STATIC int
+xchk_nlinks_compare(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink nl;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Create a new empty transaction so that we can advance the iscan
+ * cursor without deadlocking if the inobt has a cycle and push on the
+ * inactivation workqueue.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare the link
+ * counts. Inodes skipped by _compare_iter will be tried again in the
+ * next phase of the scan.
+ */
+ xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
+ while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Walk all the non-null nlink observations that weren't checked in the
+ * previous step.
+ */
+ mutex_lock(&xnc->lock);
+ while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
+ xfs_ino_t ino = cur - 1;
+
+ if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xnc->lock);
+
+ error = xchk_nlinks_compare_inum(xnc, ino);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xnc->sc, &error))
+ return error;
+
+ mutex_lock(&xnc->lock);
+ }
+ mutex_unlock(&xnc->lock);
+
+ return error;
+}
+
+/* Tear down everything associated with a nlinks check. */
+static void
+xchk_nlinks_teardown_scan(
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xnc->collect_iscan);
+
+ xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
+ xfarray_destroy(xnc->nlinks);
+ xnc->nlinks = NULL;
+
+ xchk_iscan_teardown(&xnc->collect_iscan);
+ mutex_destroy(&xnc->lock);
+ xnc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate link count data. If
+ * the scan is successful, the counts will be left alive for a repair. If any
+ * error occurs, we'll tear everything down.
+ */
+STATIC int
+xchk_nlinks_setup_scan(
+ struct xfs_scrub *sc,
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned long long max_inos;
+ xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1;
+ xfs_agino_t first_agino, last_agino;
+ int error;
+
+ mutex_init(&xnc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
+
+ /*
+ * Set up enough space to store an nlink record for the highest
+ * possible inode number in this system.
+ */
+ xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
+ max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
+ descr = xchk_xfile_descr(sc, "file link counts");
+ error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
+ sizeof(struct xchk_nlink), &xnc->nlinks);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * file link counts. The hook only triggers for inodes that were
+ * already scanned, and the scanner thread takes each inode's ILOCK,
+ * which means that any in-progress inode updates will finish before we
+ * can scan the inode.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+ error = xfs_dir_hook_add(mp, &xnc->dhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the inode link count data to repair. */
+ sc->buf_cleanup = xchk_nlinks_teardown_scan;
+ return 0;
+
+out_teardown:
+ xchk_nlinks_teardown_scan(xnc);
+ return error;
+}
+
+/* Scrub the link count of all inodes on the filesystem. */
+int
+xchk_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error = 0;
+
+ /* Set ourselves up to check link counts on the live filesystem. */
+ error = xchk_nlinks_setup_scan(sc, xnc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up link count information. */
+ error = xchk_nlinks_collect(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare link counts. */
+ error = xchk_nlinks_compare(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
new file mode 100644
index 000000000000..b820712bfd87
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NLINKS_H__
+#define __XFS_SCRUB_NLINKS_H__
+
+/* Live link count control structure. */
+struct xchk_nlink_ctrs {
+ struct xfs_scrub *sc;
+
+ /* Shadow link count data and its mutex. */
+ struct xfarray *nlinks;
+ struct mutex lock;
+
+ /*
+ * The collection step uses a separate iscan context from the compare
+ * step because the collection iscan coordinates live updates to the
+ * observation data while this scanner is running. The compare iscan
+ * is secondary and can be reinitialized as needed.
+ */
+ struct xchk_iscan collect_iscan;
+ struct xchk_iscan compare_iscan;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/*
+ * In-core link counts for a given inode in the filesystem.
+ *
+ * For an empty rootdir, the directory entries and the field to which they are
+ * accounted are as follows:
+ *
+ * Root directory:
+ *
+ * . points to self (root.child)
+ * .. points to self (root.parent)
+ * f1 points to a child file (f1.parent)
+ * d1 points to a child dir (d1.parent, root.child)
+ *
+ * Subdirectory d1:
+ *
+ * . points to self (d1.child)
+ * .. points to root dir (root.backref)
+ * f2 points to child file (f2.parent)
+ * f3 points to root.f1 (f1.parent)
+ *
+ * root.nlink == 3 (root.dot, root.dotdot, root.d1)
+ * d1.nlink == 2 (root.d1, d1.dot)
+ * f1.nlink == 2 (root.f1, d1.f3)
+ * f2.nlink == 1 (d1.f2)
+ */
+struct xchk_nlink {
+ /* Count of forward links from parent directories to this file. */
+ xfs_nlink_t parents;
+
+ /*
+ * Count of back links to this parent directory from child
+ * subdirectories.
+ */
+ xfs_nlink_t backrefs;
+
+ /*
+ * Count of forward links from this directory to all child files and
+ * the number of dot entries. Should be zero for non-directories.
+ */
+ xfs_nlink_t children;
+
+ /* Record state flags */
+ unsigned int flags;
+};
+
+/*
+ * This incore link count has been written at least once. We never want to
+ * store an xchk_nlink that looks uninitialized.
+ */
+#define XCHK_NLINK_WRITTEN (1U << 0)
+
+/* Already checked this link count record. */
+#define XCHK_NLINK_COMPARE_SCANNED (1U << 1)
+
+/* Already made a repair with this link count record. */
+#define XREP_NLINK_DIRTY (1U << 2)
+
+/* Compute total link count, using large enough variables to detect overflow. */
+static inline uint64_t
+xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live)
+{
+ uint64_t ret = live->parents;
+
+ /* Add one link count for the dot entry of any linked directory. */
+ if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink)
+ ret++;
+ return ret + live->children;
+}
+
+#endif /* __XFS_SCRUB_NLINKS_H__ */
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
new file mode 100644
index 000000000000..4ebdee095428
--- /dev/null
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -0,0 +1,351 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/tempfile.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/* Set up to repair inode link counts. */
+int
+xrep_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Inodes that aren't the root directory or the orphanage, have a nonzero link
+ * count, and no observed parents should be moved to the orphanage.
+ */
+static inline bool
+xrep_nlinks_is_orphaned(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int actual_nlink,
+ const struct xchk_nlink *obs)
+{
+ if (obs->parents != 0)
+ return false;
+ if (xchk_inode_is_dirtree_root(ip) || ip == sc->orphanage)
+ return false;
+ return actual_nlink != 0;
+}
+
+/* Remove an inode from the unlinked list. */
+STATIC int
+xrep_nlinks_iunlink_remove(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino));
+ error = xfs_iunlink_remove(sc->tp, pag, sc->ip);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Correct the link count of the given inode. Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ uint64_t total_links;
+ uint64_t actual_nlink;
+ bool orphanage_available = false;
+ bool dirty = false;
+ int error;
+
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
+ /*
+ * If the filesystem has an orphanage attached to the scrub context,
+ * prepare for a link count repair that could involve @ip being adopted
+ * by the lost+found.
+ */
+ if (xrep_orphanage_can_adopt(sc)) {
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+
+ error = xrep_adoption_trans_alloc(sc, &xnc->adoption);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ } else {
+ orphanage_available = true;
+ }
+ }
+
+ /*
+ * Either there is no orphanage or we couldn't allocate resources for
+ * that kind of update. Let's try again with only the resources we
+ * need for a simple link count update, since that's much more common.
+ */
+ if (!orphanage_available) {
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0,
+ &sc->tp);
+ if (error) {
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(sc->tp, ip, 0);
+ }
+
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * We're done accessing the shared scan data, so we can drop the lock.
+ * We still hold @ip's ILOCK, so its link count cannot change.
+ */
+ mutex_unlock(&xnc->lock);
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ /*
+ * Non-directories cannot have directories pointing up to them.
+ *
+ * We previously set error to zero, but set it again because one static
+ * checker author fears that programmers will fail to maintain this
+ * invariant and built their tool to flag this as a security risk. A
+ * different tool author made their bot complain about the redundant
+ * store. This is a never-ending and stupid battle; both tools missed
+ * *actual bugs* elsewhere; and I no longer care.
+ */
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ error = 0;
+ goto out_trans;
+ }
+
+ /*
+ * Decide if we're going to move this file to the orphanage, and fix
+ * up the incore link counts if we are.
+ */
+ if (orphanage_available &&
+ xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) {
+ /* Figure out what name we're going to use here. */
+ error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Reattach this file to the directory tree by moving it to
+ * the orphanage per the adoption parameters that we already
+ * computed.
+ */
+ error = xrep_adoption_move(&xnc->adoption);
+ if (error)
+ goto out_trans;
+
+ /*
+ * Re-read the link counts since the reparenting will have
+ * updated our scan info.
+ */
+ mutex_lock(&xnc->lock);
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_trans;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+ dirty = true;
+ }
+
+ /*
+ * If this inode is linked from the directory tree and on the unlinked
+ * list, remove it from the unlinked list.
+ */
+ if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) {
+ error = xrep_nlinks_iunlink_remove(sc);
+ if (error)
+ goto out_trans;
+ dirty = true;
+ }
+
+ /*
+ * If this inode is not linked from the directory tree yet not on the
+ * unlinked list, put it on the unlinked list.
+ */
+ if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ goto out_trans;
+ dirty = true;
+ }
+
+ /* Commit the new link count if it changed. */
+ if (total_links != actual_nlink) {
+ trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+ set_nlink(VFS_I(ip), min_t(unsigned long long, total_links,
+ XFS_NLINK_PINNED));
+ dirty = true;
+ }
+
+ if (!dirty) {
+ error = 0;
+ goto out_trans;
+ }
+
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+ error = xrep_trans_commit(sc);
+ goto out_unlock;
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_trans:
+ xchk_trans_cancel(sc);
+out_unlock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (orphanage_available) {
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem for repairs. Move on if we can't
+ * grab an inode, since we're still making forward progress.
+ */
+static int
+xrep_nlinks_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Commit the new inode link counters. */
+int
+xrep_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error;
+
+ /*
+ * We need ftype for an accurate count of the number of child
+ * subdirectory links. Child subdirectories with a back link (dotdot
+ * entry) but no forward link are moved to the orphanage, so we cannot
+ * repair the link count of the parent directory based on the back link
+ * count alone. Filesystems without ftype support are rare (old V4) so
+ * we just skip out here.
+ */
+ if (!xfs_has_ftype(sc->mp))
+ return -EOPNOTSUPP;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare and fix the
+ * link counts. Retry iget every tenth of a second for up to 30
+ * seconds -- even if repair misses a few inodes, we still try to fix
+ * as many of them as we can.
+ */
+ xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan);
+ ASSERT(sc->ip == NULL);
+
+ while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) {
+ /*
+ * Commit the scrub transaction so that we can create repair
+ * transactions with the correct reservations.
+ */
+ xchk_trans_cancel(sc);
+
+ error = xrep_nlinks_repair_inode(xnc);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip);
+ xchk_irele(sc, sc->ip);
+ sc->ip = NULL;
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ /*
+ * Create a new empty transaction so that we can advance the
+ * iscan cursor without deadlocking if the inobt has a cycle.
+ * We can only push the inactivation workqueues with an empty
+ * transaction.
+ */
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
new file mode 100644
index 000000000000..9c12cb844231
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.c
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_parent.h"
+#include "xfs_attr_sf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/orphanage.h"
+#include "scrub/readdir.h"
+
+#include <linux/namei.h>
+
+/*
+ * The Orphanage
+ * =============
+ *
+ * If the directory tree is damaged, children of that directory become
+ * inaccessible via that file path. If a child has no other parents, the file
+ * is said to be orphaned. xfs_repair fixes this situation by creating a
+ * orphanage directory (specifically, /lost+found) and creating a directory
+ * entry pointing to the orphaned file.
+ *
+ * Online repair follows this tactic by creating a root-owned /lost+found
+ * directory if one does not exist. If an orphan is found, it will move that
+ * files into orphanage.
+ */
+
+/* Make the orphanage owned by root. */
+STATIC int
+xrep_chown_orphanage(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL;
+ struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL;
+ struct inode *inode = VFS_I(dp);
+ int error;
+
+ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp);
+ if (error)
+ goto out_dqrele;
+
+ /*
+ * Always clear setuid/setgid/sticky on the orphanage since we don't
+ * normally want that functionality on this directory and xfs_repair
+ * doesn't create it this way either. Leave the other access bits
+ * unchanged.
+ */
+ inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX);
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) {
+ if (XFS_IS_UQUOTA_ON(mp))
+ oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp);
+ inode->i_uid = GLOBAL_ROOT_UID;
+ }
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) {
+ if (XFS_IS_GQUOTA_ON(mp))
+ oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp);
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+ if (dp->i_projid != 0) {
+ if (XFS_IS_PQUOTA_ON(mp))
+ oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp);
+ dp->i_projid = 0;
+ }
+
+ dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+
+ xfs_qm_dqrele(oldu);
+ xfs_qm_dqrele(oldg);
+ xfs_qm_dqrele(oldp);
+
+out_dqrele:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+ return error;
+}
+
+#define ORPHANAGE "lost+found"
+
+/* Create the orphanage directory, and set sc->orphanage to it. */
+int
+xrep_orphanage_create(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct dentry *root_dentry, *orphanage_dentry;
+ struct inode *root_inode = VFS_I(sc->mp->m_rootip);
+ struct inode *orphanage_inode;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp)) {
+ sc->orphanage = NULL;
+ return 0;
+ }
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->orphanage == NULL);
+
+ /* Find the dentry for the root directory... */
+ root_dentry = d_find_alias(root_inode);
+ if (!root_dentry) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* ...which is a directory, right? */
+ if (!d_is_dir(root_dentry)) {
+ error = -EFSCORRUPTED;
+ goto out_dput_root;
+ }
+
+ /* Try to find the orphanage directory. */
+ inode_lock_nested(root_inode, I_MUTEX_PARENT);
+ orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
+ if (IS_ERR(orphanage_dentry)) {
+ error = PTR_ERR(orphanage_dentry);
+ goto out_unlock_root;
+ }
+
+ /*
+ * Nothing found? Call mkdir to create the orphanage. Create the
+ * directory without other-user access because we're live and someone
+ * could have been relying partly on minimal access to a parent
+ * directory to control access to a file we put in here.
+ */
+ if (d_really_is_negative(orphanage_dentry)) {
+ orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
+ orphanage_dentry, 0750);
+ error = PTR_ERR(orphanage_dentry);
+ if (IS_ERR(orphanage_dentry))
+ goto out_unlock_root;
+ }
+
+ /* Not a directory? Bail out. */
+ if (!d_is_dir(orphanage_dentry)) {
+ error = -ENOTDIR;
+ goto out_dput_orphanage;
+ }
+
+ /*
+ * Grab a reference to the orphanage. This /should/ succeed since
+ * we hold the root directory locked and therefore nobody can delete
+ * the orphanage.
+ */
+ orphanage_inode = igrab(d_inode(orphanage_dentry));
+ if (!orphanage_inode) {
+ error = -ENOENT;
+ goto out_dput_orphanage;
+ }
+
+ /* Make sure the orphanage is owned by root. */
+ error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode));
+ if (error)
+ goto out_dput_orphanage;
+
+ /* Stash the reference for later and bail out. */
+ sc->orphanage = XFS_I(orphanage_inode);
+ sc->orphanage_ilock_flags = 0;
+
+out_dput_orphanage:
+ dput(orphanage_dentry);
+out_unlock_root:
+ inode_unlock(VFS_I(sc->mp->m_rootip));
+out_dput_root:
+ dput(root_dentry);
+out:
+ return error;
+}
+
+void
+xrep_orphanage_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->orphanage_ilock_flags |= ilock_flags;
+ xfs_ilock(sc->orphanage, ilock_flags);
+}
+
+bool
+xrep_orphanage_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) {
+ sc->orphanage_ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xrep_orphanage_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_iunlock(sc->orphanage, ilock_flags);
+ sc->orphanage_ilock_flags &= ~ilock_flags;
+}
+
+/* Grab the IOLOCK of the orphanage and sc->ip. */
+int
+xrep_orphanage_iolock_two(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (true) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Normal XFS takes the IOLOCK before grabbing a transaction.
+ * Scrub holds a transaction, which means that we can't block
+ * on either IOLOCK.
+ */
+ if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ break;
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ }
+ delay(1);
+ }
+
+ return 0;
+}
+
+/* Release the orphanage. */
+void
+xrep_orphanage_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->orphanage)
+ return;
+
+ if (sc->orphanage_ilock_flags)
+ xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags);
+
+ xchk_irele(sc, sc->orphanage);
+ sc->orphanage = NULL;
+}
+
+/* Adoption moves a file into /lost+found */
+
+/* Can the orphanage adopt @sc->ip? */
+bool
+xrep_orphanage_can_adopt(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->ip != NULL);
+
+ if (!sc->orphanage)
+ return false;
+ if (sc->ip == sc->orphanage)
+ return false;
+ if (xchk_inode_is_sb_rooted(sc->ip))
+ return false;
+ if (xfs_is_internal_inode(sc->ip))
+ return false;
+ return true;
+}
+
+/*
+ * Create a new transaction to send a child to the orphanage.
+ *
+ * Allocate a new transaction with sufficient disk space to handle the
+ * adoption, take ILOCK_EXCL of the orphanage and sc->ip, joins them to the
+ * transaction, and reserve quota to reparent the latter. Caller must hold the
+ * IOLOCK of the orphanage and sc->ip.
+ */
+int
+xrep_adoption_trans_alloc(
+ struct xfs_scrub *sc,
+ struct xrep_adoption *adopt)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned int child_blkres = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->ip != NULL);
+ ASSERT(sc->orphanage != NULL);
+ ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL);
+ ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+ ASSERT(!(sc->orphanage_ilock_flags &
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+
+ /* Compute the worst case space reservation that we need. */
+ adopt->sc = sc;
+ adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN);
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ child_blkres = xfs_rename_space_res(mp, 0, false,
+ xfs_name_dotdot.len, false);
+ if (xfs_has_parent(mp))
+ child_blkres += XFS_ADDAFORK_SPACE_RES(mp);
+ adopt->child_blkres = child_blkres;
+
+ /*
+ * Allocate a transaction to link the child into the parent, along with
+ * enough disk space to handle expansion of both the orphanage and the
+ * dotdot entry of a child directory.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link,
+ adopt->orphanage_blkres + adopt->child_blkres, 0, 0,
+ &sc->tp);
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL,
+ sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL;
+
+ xfs_trans_ijoin(sc->tp, sc->orphanage, 0);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /*
+ * Reserve enough quota in the orphan directory to add the new name.
+ * Normally the orphanage should have user/group/project ids of zero
+ * and hence is not subject to quota enforcement, but we're allowed to
+ * exceed quota to reattach disconnected parts of the directory tree.
+ */
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage,
+ adopt->orphanage_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+
+ /*
+ * Reserve enough quota in the child directory to change dotdot.
+ * Here we're also allowed to exceed file quota to repair inconsistent
+ * metadata.
+ */
+ if (adopt->child_blkres) {
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip,
+ adopt->child_blkres, 0, true);
+ if (error)
+ goto out_cancel;
+ }
+
+ return 0;
+out_cancel:
+ xchk_trans_cancel(sc);
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Compute the xfs_name for the directory entry that we're adding to the
+ * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not
+ * reuse namebuf until the adoption completes or is dissolved.
+ */
+int
+xrep_adoption_compute_name(
+ struct xrep_adoption *adopt,
+ struct xfs_name *xname)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ char *namebuf = (void *)xname->name;
+ xfs_ino_t ino;
+ unsigned int incr = 0;
+ int error = 0;
+
+ adopt->xname = xname;
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino);
+ xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode);
+
+ /* Make sure the filename is unique in the lost+found. */
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ while (error == 0 && incr < 10000) {
+ xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u",
+ sc->ip->i_ino, ++incr);
+ error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino);
+ }
+ if (error == 0) {
+ /* We already have 10,000 entries in the orphanage? */
+ return -EFSCORRUPTED;
+ }
+
+ if (error != -ENOENT)
+ return error;
+ return 0;
+}
+
+/*
+ * Make sure the dcache does not have a positive dentry for the name we've
+ * chosen. The caller should have checked with the ondisk directory, so any
+ * discrepancy is a sign that something is seriously wrong.
+ */
+static int
+xrep_adoption_check_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+ int error = 0;
+
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return 0;
+
+ d_child = try_lookup_noperm(&qname, d_orphanage);
+ if (d_child) {
+ trace_xrep_adoption_check_child(sc->mp, d_child);
+
+ if (d_is_positive(d_child)) {
+ ASSERT(d_is_negative(d_child));
+ error = -EFSCORRUPTED;
+ }
+
+ dput(d_child);
+ }
+
+ dput(d_orphanage);
+ return error;
+}
+
+/*
+ * Invalidate all dentries for the name that was added to the orphanage
+ * directory, and all dentries pointing to the child inode that was moved.
+ *
+ * There should not be any positive entries for the name, since we've
+ * maintained our lock on the orphanage directory.
+ */
+static void
+xrep_adoption_zap_dcache(
+ struct xrep_adoption *adopt)
+{
+ struct qstr qname = QSTR_INIT(adopt->xname->name,
+ adopt->xname->len);
+ struct xfs_scrub *sc = adopt->sc;
+ struct dentry *d_orphanage, *d_child;
+
+ /* Invalidate all dentries for the adoption name */
+ d_orphanage = d_find_alias(VFS_I(sc->orphanage));
+ if (!d_orphanage)
+ return;
+
+ d_child = try_lookup_noperm(&qname, d_orphanage);
+ while (d_child != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ ASSERT(d_is_negative(d_child));
+ d_invalidate(d_child);
+ dput(d_child);
+ d_child = d_lookup(d_orphanage, &qname);
+ }
+
+ dput(d_orphanage);
+
+ /* Invalidate all the dentries pointing down to this file. */
+ while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) {
+ trace_xrep_adoption_invalidate_child(sc->mp, d_child);
+
+ d_invalidate(d_child);
+ dput(d_child);
+ }
+}
+
+/*
+ * If we have to add an attr fork ahead of a parent pointer update, how much
+ * space should we ask for?
+ */
+static inline int
+xrep_adoption_attr_sizeof(
+ const struct xrep_adoption *adopt)
+{
+ return sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec),
+ adopt->xname->len);
+}
+
+/*
+ * Move the current file to the orphanage under the computed name.
+ *
+ * Returns with a dirty transaction so that the caller can handle any other
+ * work, such as fixing up unlinked lists or resetting link counts.
+ */
+int
+xrep_adoption_move(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode);
+ int error;
+
+ trace_xrep_adoption_reparent(sc->orphanage, adopt->xname,
+ sc->ip->i_ino);
+
+ error = xrep_adoption_check_dcache(adopt);
+ if (error)
+ return error;
+
+ /*
+ * If this filesystem has parent pointers, ensure that the file being
+ * moved to the orphanage has an attribute fork. This is required
+ * because the parent pointer code does not itself add attr forks.
+ */
+ if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) {
+ int sf_size = xrep_adoption_attr_sizeof(adopt);
+
+ error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true);
+ if (error)
+ return error;
+ }
+
+ /* Create the new name in the orphanage. */
+ error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname,
+ sc->ip->i_ino, adopt->orphanage_blkres);
+ if (error)
+ return error;
+
+ /*
+ * Bump the link count of the orphanage if we just added a
+ * subdirectory, and update its timestamps.
+ */
+ xfs_trans_ichgtime(sc->tp, sc->orphanage,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (isdir)
+ xfs_bumplink(sc->tp, sc->orphanage);
+ xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE);
+
+ /* Bump the link count of the child. */
+ if (adopt->bump_child_nlink) {
+ xfs_bumplink(sc->tp, sc->ip);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Replace the dotdot entry if the child is a subdirectory. */
+ if (isdir) {
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ sc->orphanage->i_ino, adopt->child_blkres);
+ if (error)
+ return error;
+ }
+
+ /* Add a parent pointer from the file back to the lost+found. */
+ if (xfs_has_parent(sc->mp)) {
+ error = xfs_parent_addname(sc->tp, &adopt->ppargs,
+ sc->orphanage, adopt->xname, sc->ip);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Notify dirent hooks that we moved the file to /lost+found, and
+ * finish all the deferred work so that we know the adoption is fully
+ * recorded in the log.
+ */
+ xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname);
+
+ /* Remove negative dentries from the lost+found's dcache */
+ xrep_adoption_zap_dcache(adopt);
+ return 0;
+}
+
+/*
+ * Roll to a clean scrub transaction so that we can release the orphanage,
+ * even if xrep_adoption_move was not called.
+ *
+ * Commits all the work and deferred ops attached to an adoption request and
+ * rolls to a clean scrub transaction. On success, returns 0 with the scrub
+ * context holding a clean transaction with no inodes joined. On failure,
+ * returns negative errno with no scrub transaction. All inode locks are
+ * still held after this function returns.
+ */
+int
+xrep_adoption_trans_roll(
+ struct xrep_adoption *adopt)
+{
+ struct xfs_scrub *sc = adopt->sc;
+ int error;
+
+ trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip,
+ !!(sc->tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Finish all the deferred ops to commit all repairs. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ /* Roll the transaction once more to detach the inodes. */
+ return xfs_trans_roll(&sc->tp);
+}
diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h
new file mode 100644
index 000000000000..7c7a2e7d81db
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.h
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ORPHANAGE_H__
+#define __XFS_SCRUB_ORPHANAGE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_orphanage_create(struct xfs_scrub *sc);
+
+/*
+ * If we're doing a repair, ensure that the orphanage exists and attach it to
+ * the scrub context.
+ */
+static inline int
+xrep_orphanage_try_create(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR);
+
+ error = xrep_orphanage_create(sc);
+ switch (error) {
+ case 0:
+ case -ENOENT:
+ case -ENOTDIR:
+ case -ENOSPC:
+ /*
+ * If the orphanage can't be found or isn't a directory, we'll
+ * keep going, but we won't be able to attach the file to the
+ * orphanage if we can't find the parent.
+ */
+ return 0;
+ }
+
+ return error;
+}
+
+int xrep_orphanage_iolock_two(struct xfs_scrub *sc);
+
+void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc,
+ unsigned int ilock_flags);
+void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
+void xrep_orphanage_rele(struct xfs_scrub *sc);
+
+/* Information about a request to add a file to the orphanage. */
+struct xrep_adoption {
+ struct xfs_scrub *sc;
+
+ /* Name used for the adoption. */
+ struct xfs_name *xname;
+
+ /* Parent pointer context tracking */
+ struct xfs_parent_args ppargs;
+
+ /* Block reservations for orphanage and child (if directory). */
+ unsigned int orphanage_blkres;
+ unsigned int child_blkres;
+
+ /*
+ * Does the caller want us to bump the child link count? This is not
+ * needed when reattaching files that have become disconnected but have
+ * nlink > 1. It is necessary when changing the directory tree
+ * structure.
+ */
+ bool bump_child_nlink:1;
+};
+
+bool xrep_orphanage_can_adopt(struct xfs_scrub *sc);
+
+int xrep_adoption_trans_alloc(struct xfs_scrub *sc,
+ struct xrep_adoption *adopt);
+int xrep_adoption_compute_name(struct xrep_adoption *adopt,
+ struct xfs_name *xname);
+int xrep_adoption_move(struct xrep_adoption *adopt);
+int xrep_adoption_trans_roll(struct xrep_adoption *adopt);
+#else
+struct xrep_adoption { /* empty */ };
+# define xrep_orphanage_rele(sc) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_ORPHANAGE_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 7db873672146..3b692c4acc1e 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -10,19 +10,37 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/repair.h"
+#include "scrub/listxattr.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/trace.h"
/* Set us up to scrub parents. */
int
xchk_setup_parent(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_parent(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
@@ -114,6 +132,14 @@ xchk_parent_validate(
return 0;
}
+ /* Is this the metadata root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_metadirip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_metadirino ||
+ sc->ip->i_ino != parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
/* '..' must not point to ourselves. */
if (sc->ip->i_ino == parent_ino) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
@@ -143,7 +169,8 @@ xchk_parent_validate(
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
- if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ if (dp == sc->ip || xrep_is_tempfile(dp) ||
+ !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
@@ -166,6 +193,12 @@ xchk_parent_validate(
goto out_unlock;
}
+ /* Metadata and regular inodes cannot cross trees. */
+ if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_unlock;
+ }
+
/* Look for a directory entry in the parent pointing to the child. */
error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -185,6 +218,629 @@ out_rele:
return error;
}
+/*
+ * Checking of Parent Pointers
+ * ===========================
+ *
+ * On filesystems with directory parent pointers, we check the referential
+ * integrity by visiting each parent pointer of a child file and checking that
+ * the directory referenced by the pointer actually has a dirent pointing
+ * forward to the child file.
+ */
+
+/* Deferred parent pointer entry that we saved for later. */
+struct xchk_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+};
+
+struct xchk_pptrs {
+ struct xfs_scrub *sc;
+
+ /* How many parent pointers did we find at the end? */
+ unsigned long long pptrs_found;
+
+ /* Parent of this directory. */
+ xfs_ino_t parent_ino;
+
+ /* Fixed-size array of xchk_pptr structures. */
+ struct xfarray *pptr_entries;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */
+ bool need_revalidate;
+
+ /* Name buffer */
+ struct xfs_name xname;
+ char namebuf[MAXNAMELEN];
+};
+
+/* Does this parent pointer match the dotdot entry? */
+STATIC int
+xchk_parent_scan_dotdot(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ if (pp->parent_ino == parent_ino)
+ return -ECANCELED;
+
+ return 0;
+}
+
+/* Look up the dotdot entry so that we can check it as we walk the pptrs. */
+STATIC int
+xchk_parent_pptr_and_dotdot(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /* Look up '..' */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+ if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (xchk_inode_is_dirtree_root(sc->ip)) {
+ if (sc->ip->i_ino != pp->parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
+ /*
+ * If this is now an unlinked directory, the dotdot value is
+ * meaningless as long as it points to a valid inode.
+ */
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return 0;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Otherwise, walk the pptrs again, and check. */
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp);
+ if (error == -ECANCELED) {
+ /* Found a parent pointer that matches dotdot. */
+ return 0;
+ }
+ if (!error || error == -EFSCORRUPTED) {
+ /* Found a broken parent pointer or no match. */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ return error;
+}
+
+/*
+ * Try to lock a parent directory for checking dirents. Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_parent_lock_dir(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp)
+{
+ if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED))
+ return 0;
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ if (!xfs_need_iread_extents(&dp->i_df))
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) {
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return 0;
+ }
+
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
+
+/* Check the forward link (dirent) associated with this parent pointer. */
+STATIC int
+xchk_parent_dirent(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ xfs_ino_t child_ino;
+ int error;
+
+ /*
+ * Use the name attached to this parent pointer to look up the
+ * directory entry in the alleged parent.
+ */
+ error = xchk_dir_lookup(sc, dp, xname, &child_ino);
+ if (error == -ENOENT) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* Does the inode number match? */
+ if (child_ino != sc->ip->i_ino) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Try to grab a parent directory. */
+STATIC int
+xchk_parent_iget(
+ struct xchk_pptrs *pp,
+ const struct xfs_parent_rec *pptr,
+ struct xfs_inode **dpp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *ip;
+ xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino);
+ int error;
+
+ /* Validate inode number. */
+ error = xfs_dir_ino_validate(sc->mp, parent_ino);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ error = xchk_iget(sc, parent_ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ return error;
+
+ /* The parent must be a directory. */
+ if (!S_ISDIR(VFS_I(ip)->i_mode)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ /* Validate generation number. */
+ if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) {
+ xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ goto out_rele;
+ }
+
+ *dpp = ip;
+ return 0;
+out_rele:
+ xchk_irele(sc, ip);
+ return 0;
+}
+
+/*
+ * Walk an xattr of a file. If this xattr is a parent pointer, follow it up
+ * to a parent directory and check that the parent has a dirent pointing back
+ * to us.
+ */
+STATIC int
+xchk_parent_scan_attr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xfs_name xname = {
+ .name = name,
+ .len = namelen,
+ };
+ struct xchk_pptrs *pp = priv;
+ struct xfs_inode *dp = NULL;
+ const struct xfs_parent_rec *pptr_rec = value;
+ xfs_ino_t parent_ino;
+ unsigned int lockmode;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return error;
+ }
+
+ /* No self-referential parent pointers. */
+ if (parent_ino == sc->ip->i_ino) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return -ECANCELED;
+ }
+
+ pp->pptrs_found++;
+
+ error = xchk_parent_iget(pp, pptr_rec, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /* Try to lock the inode. */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (!lockmode) {
+ struct xchk_pptr save_pp = {
+ .pptr_rec = *pptr_rec, /* struct copy */
+ .namelen = namelen,
+ };
+
+ /* Couldn't lock the inode, so save the pptr for later. */
+ trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino);
+
+ error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie,
+ &xname);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ error = xfarray_append(pp->pptr_entries, &save_pp);
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ goto out_rele;
+
+ goto out_rele;
+ }
+
+ error = xchk_parent_dirent(pp, &xname, dp);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/*
+ * Revalidate a parent pointer that we collected in the past but couldn't check
+ * because of lock contention. Returns 0 if the parent pointer is still valid,
+ * -ENOENT if it has gone away on us, or a negative errno.
+ */
+STATIC int
+xchk_parent_revalidate_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args);
+ if (error == -ENOATTR) {
+ /* Parent pointer went away, nothing to revalidate. */
+ return -ENOENT;
+ }
+
+ return error;
+}
+
+/*
+ * Check a parent pointer the slow way, which means we cycle locks a bunch
+ * and put up with revalidation until we get it done.
+ */
+STATIC int
+xchk_parent_slow_pptr(
+ struct xchk_pptrs *pp,
+ const struct xfs_name *xname,
+ struct xfs_parent_rec *pptr)
+{
+ struct xfs_scrub *sc = pp->sc;
+ struct xfs_inode *dp = NULL;
+ unsigned int lockmode;
+ int error;
+
+ /* Check that the deferred parent pointer still exists. */
+ if (pp->need_revalidate) {
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT)
+ return 0;
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0,
+ &error))
+ return error;
+ }
+
+ error = xchk_parent_iget(pp, pptr, &dp);
+ if (error)
+ return error;
+ if (!dp)
+ return 0;
+
+ /*
+ * If we can grab both IOLOCK and ILOCK of the alleged parent, we
+ * can proceed with the validation.
+ */
+ lockmode = xchk_parent_lock_dir(sc, dp);
+ if (lockmode) {
+ trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino);
+ goto check_dirent;
+ }
+
+ /*
+ * We couldn't lock the parent dir. Drop all the locks and try to
+ * get them again, one at a time.
+ */
+ xchk_iunlock(sc, sc->ilock_flags);
+ pp->need_revalidate = true;
+
+ trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino);
+
+ error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode);
+ if (error)
+ goto out_rele;
+
+ /* Revalidate the parent pointer now that we cycled locks. */
+ error = xchk_parent_revalidate_pptr(pp, xname, pptr);
+ if (error == -ENOENT) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error))
+ goto out_unlock;
+
+check_dirent:
+ error = xchk_parent_dirent(pp, xname, dp);
+out_unlock:
+ xfs_iunlock(dp, lockmode);
+out_rele:
+ xchk_irele(sc, dp);
+ return error;
+}
+
+/* Check all the parent pointers that we deferred the first time around. */
+STATIC int
+xchk_parent_finish_slow_pptrs(
+ struct xchk_pptrs *pp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ foreach_xfarray_idx(pp->pptr_entries, array_cur) {
+ struct xchk_pptr pptr;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ error = xfarray_load(pp->pptr_entries, array_cur, &pptr);
+ if (error)
+ return error;
+
+ error = xfblob_loadname(pp->pptr_names, pptr.name_cookie,
+ &pp->xname, pptr.namelen);
+ if (error)
+ return error;
+
+ error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both xfiles now that we've checked everything. */
+ xfarray_truncate(pp->pptr_entries);
+ xfblob_truncate(pp->pptr_names);
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xchk_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xchk_pptrs *pp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ pp->pptrs_found++;
+ return 0;
+}
+
+/*
+ * Compare the number of parent pointers to the link count. For
+ * non-directories these should be the same. For unlinked directories the
+ * count should be zero; for linked directories, it should be nonzero.
+ */
+STATIC int
+xchk_parent_count_pptrs(
+ struct xchk_pptrs *pp)
+{
+ struct xfs_scrub *sc = pp->sc;
+ int error;
+
+ /*
+ * If we cycled the ILOCK while cross-checking parent pointers with
+ * dirents, then we need to recalculate the number of parent pointers.
+ */
+ if (pp->need_revalidate) {
+ pp->pptrs_found = 0;
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr,
+ NULL, pp);
+ if (error == -EFSCORRUPTED) {
+ /* Found a bad parent pointer */
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+ if (error)
+ return error;
+ }
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ if (xchk_inode_is_dirtree_root(sc->ip))
+ pp->pptrs_found++;
+
+ if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ else if (VFS_I(sc->ip)->i_nlink > 0 &&
+ pp->pptrs_found == 0)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ } else {
+ /*
+ * Starting with metadir, we allow checking of parent pointers
+ * of non-directory files that are children of the superblock.
+ * Pretend that we found a parent pointer attr.
+ */
+ if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip))
+ pp->pptrs_found++;
+
+ if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found)
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return 0;
+}
+
+/* Check parent pointers of a file. */
+STATIC int
+xchk_parent_pptr(
+ struct xfs_scrub *sc)
+{
+ struct xchk_pptrs *pp;
+ char *descr;
+ int error;
+
+ pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS);
+ if (!pp)
+ return -ENOMEM;
+ pp->sc = sc;
+ pp->xname.name = pp->namebuf;
+
+ /*
+ * Set up some staging memory for parent pointers that we can't check
+ * due to locking contention.
+ */
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xchk_pptr),
+ &pp->pptr_entries);
+ kfree(descr);
+ if (error)
+ goto out_pp;
+
+ descr = xchk_xfile_ino_descr(sc, "slow parent pointer names");
+ error = xfblob_create(descr, &pp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_entries;
+
+ error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ error = xchk_parent_finish_slow_pptrs(pp);
+ if (error == -ETIMEDOUT) {
+ /* Couldn't grab a lock, scrub was marked incomplete */
+ error = 0;
+ goto out_names;
+ }
+ if (error)
+ goto out_names;
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_names;
+
+ /*
+ * For subdirectories, make sure the dotdot entry references the same
+ * inode as the parent pointers.
+ *
+ * If we're scanning a /consistent/ directory, there should only be
+ * one parent pointer, and it should point to the same directory as
+ * the dotdot entry.
+ *
+ * However, a corrupt directory tree might feature a subdirectory with
+ * multiple parents. The directory loop scanner is responsible for
+ * correcting that kind of problem, so for now we only validate that
+ * the dotdot entry matches /one/ of the parents.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xchk_parent_pptr_and_dotdot(pp);
+ if (error)
+ goto out_names;
+ }
+
+ if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_names;
+
+ /*
+ * Complain if the number of parent pointers doesn't match the link
+ * count. This could be a sign of missing parent pointers (or an
+ * incorrect link count).
+ */
+ error = xchk_parent_count_pptrs(pp);
+ if (error)
+ goto out_names;
+
+out_names:
+ xfblob_destroy(pp->pptr_names);
+out_entries:
+ xfarray_destroy(pp->pptr_entries);
+out_pp:
+ kvfree(pp);
+ return error;
+}
+
/* Scrub a parent pointer. */
int
xchk_parent(
@@ -194,6 +850,9 @@ xchk_parent(
xfs_ino_t parent_ino;
int error = 0;
+ if (xfs_has_parent(mp))
+ return xchk_parent_pptr(sc);
+
/*
* If we're a directory, check that the '..' link points up to
* a directory that has one entry pointing to us.
@@ -237,3 +896,63 @@ xchk_parent(
return error;
}
+
+/*
+ * Decide if this file's extended attributes (and therefore its parent
+ * pointers) have been zapped to satisfy the inode and ifork verifiers.
+ * Checking and repairing should be postponed until the extended attribute
+ * structure is fixed.
+ */
+bool
+xchk_pptr_looks_zapped(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(xfs_has_parent(ip->i_mount));
+
+ /*
+ * Temporary files that cannot be linked into the directory tree do not
+ * have attr forks because they cannot ever have parents.
+ */
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ return false;
+
+ /*
+ * Directory tree roots do not have parents, so the expected outcome
+ * of a parent pointer scan is always the empty set. It's safe to scan
+ * them even if the attr fork was zapped.
+ */
+ if (xchk_inode_is_dirtree_root(ip))
+ return false;
+
+ /*
+ * Metadata inodes that are rooted in the superblock do not have any
+ * parents. Hence the attr fork will not be initialized, but there are
+ * no parent pointers that might have been zapped.
+ */
+ if (xchk_inode_is_sb_rooted(ip))
+ return false;
+
+ /*
+ * Linked and linkable non-rootdir files should always have an
+ * attribute fork because that is where parent pointers are
+ * stored. If the fork is absent, something is amiss.
+ */
+ if (!xfs_inode_has_attr_fork(ip))
+ return true;
+
+ /* Repair zapped this file's attr fork a short time ago */
+ if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbta
+ * scrubber to reconstruct the block mappings. The extended attribute
+ * structure always contain some content when parent pointers are
+ * enabled, so this is a clear sign of a zapped attr fork.
+ */
+ return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0;
+}
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..31bfe10be22a
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,1639 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/orphanage.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr_repair.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information about
+ * conflicts with the VFS. The findparent code wll keep our incore parent
+ * inode up to date.
+ *
+ * If parent pointers are enabled, we instead reconstruct the parent pointer
+ * information by visiting every directory entry of every directory in the
+ * system and translating the relevant dirents into parent pointers. In this
+ * case, it is advantageous to stash all parent pointers created from dirents
+ * from a single parent file before replaying them into the temporary file. To
+ * save memory, the live filesystem scan reuses the findparent object. Parent
+ * pointer repair chooses either directory scanning or findparent, but not
+ * both.
+ *
+ * When salvaging completes, the remaining stashed entries are replayed to the
+ * temporary file. All non-parent pointer extended attributes are copied to
+ * the temporary file's extended attributes. An atomic file mapping exchange
+ * is used to commit the new xattr blocks to the file being repaired. This
+ * will disrupt attrmulti cursors.
+ */
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_PPTR_ADD (1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_PPTR_REMOVE (2)
+
+/* A stashed parent pointer update. */
+struct xrep_pptr {
+ /* Cookie for retrieval of the pptr name. */
+ xfblob_cookie name_cookie;
+
+ /* Parent pointer record. */
+ struct xfs_parent_rec pptr_rec;
+
+ /* Length of the pptr name. */
+ uint8_t namelen;
+
+ /* XREP_PPTR_{ADD,REMOVE} */
+ uint8_t action;
+};
+
+/*
+ * Stash up to 8 pages of recovered parent pointers in pptr_recs and
+ * pptr_names before we write them to the temp file.
+ */
+#define XREP_PARENT_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+struct xrep_parent {
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_pptr structures. */
+ struct xfarray *pptr_recs;
+
+ /* Blobs containing parent pointer names. */
+ struct xfblob *pptr_names;
+
+ /* xattr keys */
+ struct xfarray *xattr_records;
+
+ /* xattr values */
+ struct xfblob *xattr_blobs;
+
+ /* Scratch buffers for saving extended attributes */
+ unsigned char *xattr_name;
+ void *xattr_value;
+ unsigned int xattr_value_sz;
+
+ /*
+ * Information used to exchange the attr fork mappings, if the fs
+ * supports parent pointers.
+ */
+ struct xrep_tempexch tx;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. On filesystems without parent
+ * pointers, we use the findparent_* functions on this object and
+ * access only the parent_ino field directly.
+ *
+ * When parent pointers are enabled, the directory entry scanner uses
+ * the iscan, hooks, and lock fields of this object directly.
+ * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and
+ * pptr_scratch. This reduces the memory requirements of this
+ * structure.
+ *
+ * The lock also controls access to xattr_records and xattr_blobs(?)
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /* Orphanage reparenting request. */
+ struct xrep_adoption adoption;
+
+ /* Directory entry name, plus the trailing null. */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+
+ /* Scratch buffer for scanning pptr xattrs */
+ struct xfs_da_args pptr_args;
+
+ /* Have we seen any live updates of parent pointers recently? */
+ bool saw_pptr_updates;
+
+ /* Number of parents we found after all other repairs */
+ unsigned long long parents;
+};
+
+struct xrep_parent_xattr {
+ /* Cookie for retrieval of the xattr name. */
+ xfblob_cookie name_cookie;
+
+ /* Cookie for retrieval of the xattr value. */
+ xfblob_cookie value_cookie;
+
+ /* XFS_ATTR_* flags */
+ int flags;
+
+ /* Length of the value and name. */
+ uint32_t valuelen;
+ uint16_t namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+/* Tear down all the incore stuff we created. */
+static void
+xrep_parent_teardown(
+ struct xrep_parent *rp)
+{
+ xrep_findparent_scan_teardown(&rp->pscan);
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ if (rp->xattr_blobs)
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+ if (rp->xattr_records)
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+ if (rp->pptr_names)
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+ if (rp->pptr_recs)
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+}
+
+/* Set up for a parent repair. */
+int
+xrep_setup_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS);
+ if (!rp)
+ return -ENOMEM;
+ rp->sc = sc;
+ rp->xname.name = rp->namebuf;
+ sc->buf = rp;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ return xrep_orphanage_try_create(sc);
+}
+
+/*
+ * Scan all files in the filesystem for a child dirent that we can turn into
+ * the dotdot entry for this directory.
+ */
+STATIC int
+xrep_parent_find_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int sick, checked;
+ int error;
+
+ /*
+ * Avoid sick directories. There shouldn't be anyone else clearing the
+ * directory's sick status.
+ */
+ xfs_inode_measure_sickness(sc->ip, &sick, &checked);
+ if (sick & XFS_SICK_INO_DIR)
+ return -EFSCORRUPTED;
+
+ ino = xrep_findparent_self_reference(sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for the dotdot
+ * entry. Figure out who is going to be the parent of this directory,
+ * then retake the ILOCK so that we can salvage directory entries.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* Does the VFS dcache have an answer for us? */
+ ino = xrep_findparent_from_dcache(sc);
+ if (ino != NULLFSINO) {
+ error = xrep_findparent_confirm(sc, &ino);
+ if (!error && ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ goto out_relock;
+ }
+ }
+
+ /* Scan the entire filesystem for a parent. */
+ error = xrep_findparent_scan(&rp->pscan);
+out_relock:
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_parent_replay_update(
+ struct xrep_parent *rp,
+ const struct xfs_name *xname,
+ struct xrep_pptr *pptr)
+{
+ struct xfs_scrub *sc = rp->sc;
+
+ switch (pptr->action) {
+ case XREP_PPTR_ADD:
+ /* Create parent pointer. */
+ trace_xrep_parent_replay_parentadd(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ case XREP_PPTR_REMOVE:
+ /* Remove parent pointer. */
+ trace_xrep_parent_replay_parentremove(sc->tempip, xname,
+ &pptr->pptr_rec);
+
+ return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname,
+ &pptr->pptr_rec, &rp->pptr_args);
+ }
+
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Flush stashed parent pointer updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the parent pointer
+ * rebuild, since files can have a lot of hardlinks and the fs can be busy.
+ *
+ * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile
+ * IOLOCK.
+ */
+STATIC int
+xrep_parent_replay_updates(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ mutex_lock(&rp->pscan.lock);
+ foreach_xfarray_idx(rp->pptr_recs, array_cur) {
+ struct xrep_pptr pptr;
+
+ error = xfarray_load(rp->pptr_recs, array_cur, &pptr);
+ if (error)
+ goto out_unlock;
+
+ error = xfblob_loadname(rp->pptr_names, pptr.name_cookie,
+ &rp->xname, pptr.namelen);
+ if (error)
+ goto out_unlock;
+ rp->xname.len = pptr.namelen;
+ mutex_unlock(&rp->pscan.lock);
+
+ error = xrep_parent_replay_update(rp, &rp->xname, &pptr);
+ if (error)
+ return error;
+
+ mutex_lock(&rp->pscan.lock);
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->pptr_recs);
+ xfblob_truncate(rp->pptr_names);
+ mutex_unlock(&rp->pscan.lock);
+ return 0;
+out_unlock:
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Remember that we want to create a parent pointer in the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentadd(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_ADD,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Remember that we want to remove a parent pointer from the tempfile. These
+ * stashed actions will be replayed later.
+ */
+STATIC int
+xrep_parent_stash_parentremove(
+ struct xrep_parent *rp,
+ const struct xfs_name *name,
+ const struct xfs_inode *dp)
+{
+ struct xrep_pptr pptr = {
+ .action = XREP_PPTR_REMOVE,
+ .namelen = name->len,
+ };
+ int error;
+
+ trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name);
+
+ xfs_inode_to_parent_rec(&pptr.pptr_rec, dp);
+ error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->pptr_recs, &pptr);
+}
+
+/*
+ * Examine an entry of a directory. If this dirent leads us back to the file
+ * whose parent pointers we're rebuilding, add a pptr to the temporary
+ * directory.
+ */
+STATIC int
+xrep_parent_scan_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ /* Dirent doesn't point to this directory. */
+ if (ino != rp->sc->ip->i_ino)
+ return 0;
+
+ /* No weird looking names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* No mismatching ftypes. */
+ if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Transform this dirent into a parent pointer and queue it for later
+ * addition to the temporary file.
+ */
+ mutex_lock(&rp->pscan.lock);
+ error = xrep_parent_stash_parentadd(rp, name, dp);
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/*
+ * Decide if we want to look for dirents in this directory. Skip the file
+ * being repaired and any files being used to stage repairs.
+ */
+static inline bool
+xrep_parent_want_scan(
+ struct xrep_parent *rp,
+ const struct xfs_inode *ip)
+{
+ return ip != rp->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt.
+ * Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_parent_scan_ilock(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /* Still need to take the shared ILOCK to advance the iscan cursor. */
+ if (!xrep_parent_want_scan(rp, ip))
+ goto lock;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/*
+ * Scan this file for relevant child dirents that point to the file whose
+ * parent pointers we're rebuilding.
+ */
+STATIC int
+xrep_parent_scan_file(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ lock_mode = xrep_parent_scan_ilock(rp, ip);
+
+ if (!xrep_parent_want_scan(rp, ip))
+ goto scan_done;
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ /*
+ * If the directory looks as though it has been zapped by the
+ * inode record repair code, we cannot scan for child dirents.
+ */
+ if (xchk_dir_looks_zapped(ip)) {
+ error = -EBUSY;
+ goto scan_done;
+ }
+
+ error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp);
+ if (error)
+ goto scan_done;
+ }
+
+scan_done:
+ xchk_iscan_mark_visited(&rp->pscan.iscan, ip);
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Decide if we've stashed too much pptr data in memory. */
+static inline bool
+xrep_parent_want_flush_stashed(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names);
+ return bytes > XREP_PARENT_MAX_STASH_BYTES;
+}
+
+/*
+ * Scan all directories in the filesystem to look for dirents that we can turn
+ * into parent pointers.
+ */
+STATIC int
+xrep_parent_scan_dirtree(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Filesystem scans are time consuming. Drop the file ILOCK and all
+ * other resources for the duration of the scan and hope for the best.
+ * The live update hooks will keep our scan information up to date.
+ */
+ xchk_trans_cancel(sc);
+ if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+ xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+ XFS_ILOCK_EXCL));
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
+ bool flush;
+
+ error = xrep_parent_scan_file(rp, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ /* Flush stashed pptr updates to constrain memory usage. */
+ mutex_lock(&rp->pscan.lock);
+ flush = xrep_parent_want_flush_stashed(rp);
+ mutex_unlock(&rp->pscan.lock);
+ if (flush) {
+ xchk_trans_cancel(sc);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ break;
+
+ error = xrep_parent_replay_updates(rp);
+ xrep_tempfile_iounlock(sc);
+ if (error)
+ break;
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rp->pscan.iscan);
+ if (error) {
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Retake sc->ip's ILOCK now that we're done flushing stashed parent
+ * pointers. We end this function with an empty transaction and the
+ * ILOCK.
+ */
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Capture dirent updates being made by other threads which are relevant to the
+ * file being repaired.
+ */
+STATIC int
+xrep_parent_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xrep_parent *rp;
+ struct xfs_scrub *sc;
+ int error;
+
+ rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb);
+ sc = rp->sc;
+
+ /*
+ * This thread updated a dirent that points to the file that we're
+ * repairing, so stash the update for replay against the temporary
+ * file.
+ */
+ if (p->ip->i_ino == sc->ip->i_ino &&
+ xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) {
+ mutex_lock(&rp->pscan.lock);
+ if (p->delta > 0)
+ error = xrep_parent_stash_parentadd(rp, p->name, p->dp);
+ else
+ error = xrep_parent_stash_parentremove(rp, p->name,
+ p->dp);
+ if (!error)
+ rp->saw_pptr_updates = true;
+ mutex_unlock(&rp->pscan.lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+out_abort:
+ xchk_iscan_abort(&rp->pscan.iscan);
+ return NOTIFY_DONE;
+}
+
+/* Reset a directory's dotdot entry, if needed. */
+STATIC int
+xrep_parent_reset_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int spaceres;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino);
+ if (error || ino == rp->pscan.parent_ino)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino);
+
+ /*
+ * Reserve more space just in case we have to expand the dir. We're
+ * allowed to exceed quota to repair inconsistent metadata.
+ */
+ spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len,
+ false);
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0,
+ true);
+ if (error)
+ return error;
+
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ rp->pscan.parent_ino, spaceres);
+ if (error)
+ return error;
+
+ /*
+ * Roll transaction to detach the inode from the transaction but retain
+ * ILOCK_EXCL.
+ */
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Pass back the parent inumber if this a parent pointer */
+STATIC int
+xrep_parent_lookup_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ xfs_ino_t *inop = priv;
+ xfs_ino_t parent_ino;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, &parent_ino, NULL);
+ if (error)
+ return error;
+
+ *inop = parent_ino;
+ return -ECANCELED;
+}
+
+/*
+ * Find the first parent of the scrub target by walking parent pointers for
+ * the purpose of deciding if we're going to move it to the orphanage.
+ * We don't care if the attr fork is zapped.
+ */
+STATIC int
+xrep_parent_lookup_pptrs(
+ struct xfs_scrub *sc,
+ xfs_ino_t *inop)
+{
+ int error;
+
+ *inop = NULLFSINO;
+
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL,
+ inop);
+ if (error && error != -ECANCELED)
+ return error;
+ return 0;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_parent_move_to_orphanage(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t orig_parent, new_parent;
+ int error;
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ /*
+ * We are about to drop the ILOCK on sc->ip to lock the
+ * orphanage and prepare for the adoption. Therefore, look up
+ * the old dotdot entry for sc->ip so that we can compare it
+ * after we re-lock sc->ip.
+ */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &orig_parent);
+ if (error)
+ return error;
+ } else {
+ /*
+ * We haven't dropped the ILOCK since we committed the new
+ * xattr structure (and hence the new parent pointer records),
+ * which means that the file cannot have been moved in the
+ * directory tree, and there are no parents.
+ */
+ orig_parent = NULLFSINO;
+ }
+
+ /*
+ * Drop the ILOCK on the scrub target and commit the transaction.
+ * Adoption computes its own resource requirements and gathers the
+ * necessary components.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /* If we can take the orphanage's iolock then we're ready to move. */
+ if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+ xchk_iunlock(sc, sc->ilock_flags);
+ error = xrep_orphanage_iolock_two(sc);
+ if (error)
+ return error;
+ }
+
+ /* Grab transaction and ILOCK the two files. */
+ error = xrep_adoption_trans_alloc(sc, &rp->adoption);
+ if (error)
+ return error;
+
+ error = xrep_adoption_compute_name(&rp->adoption, &rp->xname);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+ * entry again. If the parent changed or the child was unlinked while
+ * the child directory was unlocked, we don't need to move the child to
+ * the orphanage after all. For a non-directory, we have to scan for
+ * the first parent pointer to see if one has been added.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &new_parent);
+ else
+ error = xrep_parent_lookup_pptrs(sc, &new_parent);
+ if (error)
+ return error;
+
+ /*
+ * Attach to the orphanage if we still have a linked directory and it
+ * hasn't been moved.
+ */
+ if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+ error = xrep_adoption_move(&rp->adoption);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Launder the scrub transaction so we can drop the orphanage ILOCK
+ * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK.
+ */
+ error = xrep_adoption_trans_roll(&rp->adoption);
+ if (error)
+ return error;
+
+ xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/* Ensure that the xattr value buffer is large enough. */
+STATIC int
+xrep_parent_alloc_xattr_value(
+ struct xrep_parent *rp,
+ size_t bufsize)
+{
+ void *new_val;
+
+ if (rp->xattr_value_sz >= bufsize)
+ return 0;
+
+ if (rp->xattr_value) {
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+ rp->xattr_value_sz = 0;
+ }
+
+ new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!new_val)
+ return -ENOMEM;
+
+ rp->xattr_value = new_val;
+ rp->xattr_value_sz = bufsize;
+ return 0;
+}
+
+/* Retrieve the (remote) value of a non-pptr xattr. */
+STATIC int
+xrep_parent_fetch_xattr_remote(
+ struct xrep_parent *rp,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ unsigned int valuelen)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_da_args args = {
+ .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .dp = ip,
+ .name = name,
+ .namelen = namelen,
+ .trans = sc->tp,
+ .valuelen = valuelen,
+ .owner = ip->i_ino,
+ };
+ int error;
+
+ /*
+ * If we need a larger value buffer, try to allocate one. If that
+ * fails, return with -EDEADLOCK to try harder.
+ */
+ error = xrep_parent_alloc_xattr_value(rp, valuelen);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+
+ args.value = rp->xattr_value;
+ xfs_attr_sethash(&args);
+ return xfs_attr_get_ilocked(&args);
+}
+
+/* Stash non-pptr attributes for later replay into the temporary file. */
+STATIC int
+xrep_parent_stash_xattr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent_xattr key = {
+ .valuelen = valuelen,
+ .namelen = namelen,
+ .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK,
+ };
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT))
+ return 0;
+
+ if (!value) {
+ error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags,
+ name, namelen, valuelen);
+ if (error)
+ return error;
+
+ value = rp->xattr_value;
+ }
+
+ trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name,
+ key.namelen, key.valuelen);
+
+ error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name,
+ key.namelen);
+ if (error)
+ return error;
+
+ error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value,
+ key.valuelen);
+ if (error)
+ return error;
+
+ return xfarray_append(rp->xattr_records, &key);
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_parent_insert_xattr(
+ struct xrep_parent *rp,
+ const struct xrep_parent_xattr *key)
+{
+ struct xfs_da_args args = {
+ .dp = rp->sc->tempip,
+ .attr_filter = key->flags,
+ .namelen = key->namelen,
+ .valuelen = key->valuelen,
+ .owner = rp->sc->ip->i_ino,
+ .geo = rp->sc->mp->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ int error;
+
+ ASSERT(!(key->flags & XFS_ATTR_PARENT));
+
+ /*
+ * Grab pointers to the scrub buffer so that we can use them to insert
+ * attrs into the temp file.
+ */
+ args.name = rp->xattr_name;
+ args.value = rp->xattr_value;
+
+ /*
+ * The attribute name is stored near the end of the in-core buffer,
+ * though we reserve one more byte to ensure null termination.
+ */
+ rp->xattr_name[XATTR_NAME_MAX] = 0;
+
+ error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name,
+ key->namelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->name_cookie);
+ if (error)
+ return error;
+
+ error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value,
+ key->valuelen);
+ if (error)
+ return error;
+
+ error = xfblob_free(rp->xattr_blobs, key->value_cookie);
+ if (error)
+ return error;
+
+ rp->xattr_name[key->namelen] = 0;
+
+ trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags,
+ rp->xattr_name, key->namelen, key->valuelen);
+
+ xfs_attr_sethash(&args);
+ return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false);
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_parent_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ xfarray_idx_t array_cur;
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and the empty scrub
+ * transaction that we created for the xattr scan. We hold ILOCK_EXCL
+ * on the inode being repaired.
+ *
+ * To constrain kernel memory use, we occasionally flush salvaged
+ * xattrs from the xfarray and xfblob structures into the temporary
+ * file in preparation for exchanging the xattr structures at the end.
+ * Updating the temporary file requires a transaction, so we commit the
+ * scrub transaction and drop the ILOCK so that xfs_attr_set can
+ * allocate whatever transaction it wants.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from adding xattrs (or parent pointers) while we're
+ * flushing.
+ */
+ xchk_trans_cancel(rp->sc);
+ xchk_iunlock(rp->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify xattrs. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rp->sc);
+ if (error)
+ return error;
+
+ /* Add all the salvaged attrs to the temporary file. */
+ foreach_xfarray_idx(rp->xattr_records, array_cur) {
+ struct xrep_parent_xattr key;
+
+ error = xfarray_load(rp->xattr_records, array_cur, &key);
+ if (error)
+ return error;
+
+ error = xrep_parent_insert_xattr(rp, &key);
+ if (error)
+ return error;
+ }
+
+ /* Empty out both arrays now that we've added the entries. */
+ xfarray_truncate(rp->xattr_records);
+ xfblob_truncate(rp->xattr_blobs);
+
+ xrep_tempfile_iounlock(rp->sc);
+
+ /* Recreate the empty transaction and relock the inode. */
+ error = xchk_trans_alloc_empty(rp->sc);
+ if (error)
+ return error;
+ xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Decide if we've stashed too much xattr data in memory. */
+static inline bool
+xrep_parent_want_flush_xattrs(
+ struct xrep_parent *rp)
+{
+ unsigned long long bytes;
+
+ bytes = xfarray_bytes(rp->xattr_records) +
+ xfblob_bytes(rp->xattr_blobs);
+ return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES;
+}
+
+/* Flush staged attributes to the temporary file if we're over the limit. */
+STATIC int
+xrep_parent_try_flush_xattrs(
+ struct xfs_scrub *sc,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!xrep_parent_want_flush_xattrs(rp))
+ return 0;
+
+ error = xrep_parent_flush_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * If there were any parent pointer updates to the xattr structure
+ * while we dropped the ILOCK, the xattr structure is now stale.
+ * Signal to the attr copy process that we need to start over, but
+ * this time without opportunistic attr flushing.
+ *
+ * This is unlikely to happen, so we're ok with restarting the copy.
+ */
+ mutex_lock(&rp->pscan.lock);
+ if (rp->saw_pptr_updates)
+ error = -ESTALE;
+ mutex_unlock(&rp->pscan.lock);
+ return error;
+}
+
+/* Copy all the non-pptr extended attributes into the temporary file. */
+STATIC int
+xrep_parent_copy_xattrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there
+ * can't be any parent pointer updates in progress.
+ */
+ mutex_lock(&rp->pscan.lock);
+ rp->saw_pptr_updates = false;
+ mutex_unlock(&rp->pscan.lock);
+
+ /* Copy xattrs, stopping periodically to flush the incore buffers. */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ xrep_parent_try_flush_xattrs, rp);
+ if (error && error != -ESTALE)
+ return error;
+
+ if (error == -ESTALE) {
+ /*
+ * The xattr copy collided with a parent pointer update.
+ * Restart the copy, but this time hold the ILOCK all the way
+ * to the end to lock out any directory parent pointer updates.
+ */
+ error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr,
+ NULL, rp);
+ if (error)
+ return error;
+ }
+
+ /* Flush any remaining stashed xattrs to the temporary file. */
+ if (xfarray_bytes(rp->xattr_records) == 0)
+ return 0;
+
+ return xrep_parent_flush_xattrs(rp);
+}
+
+/*
+ * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head
+ * into the attr fork exchange transaction. All files on a filesystem with
+ * parent pointers must have an attr fork because the parent pointer code does
+ * not itself add attribute forks.
+ *
+ * Note: Unlinkable unlinked files don't need one, but the overhead of having
+ * an unnecessary attr fork is not justified by the additional code complexity
+ * that would be needed to track that state correctly.
+ */
+STATIC int
+xrep_parent_ensure_attr_fork(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ error = xfs_attr_add_fork(sc->tempip,
+ sizeof(struct xfs_attr_sf_hdr), 1);
+ if (error)
+ return error;
+ return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new attribute structure.
+ */
+STATIC int
+xrep_parent_finalize_tempfile(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ int error;
+
+ /*
+ * Repair relies on the ILOCK to quiesce all possible xattr updates.
+ * Replay all queued parent pointer updates into the tempfile before
+ * exchanging the contents, even if that means dropping the ILOCKs and
+ * the transaction.
+ */
+ do {
+ error = xrep_parent_replay_updates(rp);
+ if (error)
+ return error;
+
+ error = xrep_parent_ensure_attr_fork(rp);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx);
+ if (error)
+ return error;
+
+ if (xfarray_length(rp->pptr_recs) == 0)
+ break;
+
+ xchk_trans_cancel(sc);
+ xrep_tempfile_iunlock_both(sc);
+ } while (!xchk_should_terminate(sc, &error));
+ return error;
+}
+
+/*
+ * Replay all the stashed parent pointers into the temporary file, copy all
+ * the non-pptr xattrs from the file being repaired into the temporary file,
+ * and exchange the attr fork contents atomically.
+ */
+STATIC int
+xrep_parent_rebuild_pptrs(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t parent_ino = NULLFSINO;
+ int error;
+
+ /*
+ * Copy non-ppttr xattrs from the file being repaired into the
+ * temporary file's xattr structure. We hold sc->ip's IOLOCK, which
+ * prevents setxattr/removexattr calls from occurring, but renames
+ * update the parent pointers without holding IOLOCK. If we detect
+ * stale attr structures, we restart the scan but only flush at the
+ * end.
+ */
+ error = xrep_parent_copy_xattrs(rp);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the empty transaction that we used to walk and copy attrs,
+ * and drop the ILOCK so that we can take the IOLOCK on the temporary
+ * file. We still hold sc->ip's IOLOCK.
+ */
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ error = xrep_tempfile_iolock_polled(sc);
+ if (error)
+ return error;
+
+ /*
+ * Allocate transaction, lock inodes, and make sure that we've replayed
+ * all the stashed pptr updates to the tempdir. After this point,
+ * we're ready to exchange the attr fork mappings.
+ */
+ error = xrep_parent_finalize_tempfile(rp);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing pptr fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rp->pscan.iscan))
+ return -ECANCELED;
+
+ /*
+ * Exchange the attr fork contents and junk the old attr fork contents,
+ * which are now in the tempfile.
+ */
+ error = xrep_xattr_swap(sc, &rp->tx);
+ if (error)
+ return error;
+ error = xrep_xattr_reset_tempfile_fork(sc);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target file.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+
+ /*
+ * We've committed the new parent pointers. Find at least one parent
+ * so that we can decide if we're moving this file to the orphanage.
+ * For this purpose, root directories are their own parents.
+ */
+ if (xchk_inode_is_dirtree_root(sc->ip)) {
+ xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino);
+ } else {
+ error = xrep_parent_lookup_pptrs(sc, &parent_ino);
+ if (error)
+ return error;
+ if (parent_ino != NULLFSINO)
+ xrep_findparent_scan_found(&rp->pscan, parent_ino);
+ }
+ return 0;
+}
+
+/*
+ * Commit the new parent pointer structure (currently only the dotdot entry) to
+ * the file that we're repairing.
+ */
+STATIC int
+xrep_parent_rebuild_tree(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ bool try_adoption;
+ int error;
+
+ if (xfs_has_parent(sc->mp)) {
+ error = xrep_parent_rebuild_pptrs(rp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Any file with no parent could be adopted. This check happens after
+ * rebuilding the parent pointer structure because we might have cycled
+ * the ILOCK during that process.
+ */
+ try_adoption = rp->pscan.parent_ino == NULLFSINO;
+
+ /*
+ * Starting with metadir, we allow checking of parent pointers
+ * of non-directory files that are children of the superblock.
+ * Lack of parent is ok here.
+ */
+ if (try_adoption && xfs_has_metadir(sc->mp) &&
+ xchk_inode_is_sb_rooted(sc->ip))
+ try_adoption = false;
+
+ if (try_adoption) {
+ if (xrep_orphanage_can_adopt(sc))
+ return xrep_parent_move_to_orphanage(rp);
+ return -EFSCORRUPTED;
+
+ }
+
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ return xrep_parent_reset_dotdot(rp);
+
+ return 0;
+}
+
+/* Count the number of parent pointers. */
+STATIC int
+xrep_parent_count_pptr(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int attr_flags,
+ const unsigned char *name,
+ unsigned int namelen,
+ const void *value,
+ unsigned int valuelen,
+ void *priv)
+{
+ struct xrep_parent *rp = priv;
+ int error;
+
+ if (!(attr_flags & XFS_ATTR_PARENT))
+ return 0;
+
+ error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+ valuelen, NULL, NULL);
+ if (error)
+ return error;
+
+ rp->parents++;
+ return 0;
+}
+
+/*
+ * After all parent pointer rebuilding and adoption activity completes, reset
+ * the link count of this nondirectory, having scanned the fs to rebuild all
+ * parent pointers.
+ */
+STATIC int
+xrep_parent_set_nondir_nlink(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_perag *pag;
+ bool joined = false;
+ int error;
+
+ /* Count parent pointers so we can reset the file link count. */
+ rp->parents = 0;
+ error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp);
+ if (error)
+ return error;
+
+ /*
+ * Starting with metadir, we allow checking of parent pointers of
+ * non-directory files that are children of the superblock. Pretend
+ * that we found a parent pointer attr.
+ */
+ if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip))
+ rp->parents++;
+
+ if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is on the unlinked list but we found parents.
+ * Remove the file from the unlinked list.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_remove(sc->tp, pag, ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+
+ /*
+ * The file is not on the unlinked list but we found no
+ * parents. Add the file to the unlinked list.
+ */
+ error = xfs_iunlink(sc->tp, ip);
+ if (error)
+ return error;
+ }
+
+ /* Set the correct link count. */
+ if (VFS_I(ip)->i_nlink != rp->parents) {
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents,
+ XFS_NLINK_PINNED));
+ }
+
+ /* Log the inode to keep it moving forward if we dirtied anything. */
+ if (joined)
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/* Set up the filesystem scan so we can look for parents. */
+STATIC int
+xrep_parent_setup_scan(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ char *descr;
+ struct xfs_da_geometry *geo = sc->mp->m_attr_geo;
+ int max_len;
+ int error;
+
+ if (!xfs_has_parent(sc->mp))
+ return xrep_findparent_scan_start(sc, &rp->pscan);
+
+ /* Buffers for copying non-pptr attrs to the tempfile */
+ rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS);
+ if (!rp->xattr_name)
+ return -ENOMEM;
+
+ /*
+ * Allocate enough memory to handle loading local attr values from the
+ * xfblob data while flushing stashed attrs to the temporary file.
+ * We only realloc the buffer when salvaging remote attr values, so
+ * TRY_HARDER means we allocate the maximal attr value size.
+ */
+ if (sc->flags & XCHK_TRY_HARDER)
+ max_len = XATTR_SIZE_MAX;
+ else
+ max_len = xfs_attr_leaf_entsize_local_max(geo->blksize);
+ error = xrep_parent_alloc_xattr_value(rp, max_len);
+ if (error)
+ goto out_xattr_name;
+
+ /* Set up some staging memory for logging parent pointer updates. */
+ descr = xchk_xfile_ino_descr(sc, "parent pointer entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_pptr),
+ &rp->pptr_recs);
+ kfree(descr);
+ if (error)
+ goto out_xattr_value;
+
+ descr = xchk_xfile_ino_descr(sc, "parent pointer names");
+ error = xfblob_create(descr, &rp->pptr_names);
+ kfree(descr);
+ if (error)
+ goto out_recs;
+
+ /* Set up some storage for copying attrs before the mapping exchange */
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr entries");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr),
+ &rp->xattr_records);
+ kfree(descr);
+ if (error)
+ goto out_names;
+
+ descr = xchk_xfile_ino_descr(sc,
+ "parent pointer retained xattr values");
+ error = xfblob_create(descr, &rp->xattr_blobs);
+ kfree(descr);
+ if (error)
+ goto out_attr_keys;
+
+ error = __xrep_findparent_scan_start(sc, &rp->pscan,
+ xrep_parent_live_update);
+ if (error)
+ goto out_attr_values;
+
+ return 0;
+
+out_attr_values:
+ xfblob_destroy(rp->xattr_blobs);
+ rp->xattr_blobs = NULL;
+out_attr_keys:
+ xfarray_destroy(rp->xattr_records);
+ rp->xattr_records = NULL;
+out_names:
+ xfblob_destroy(rp->pptr_names);
+ rp->pptr_names = NULL;
+out_recs:
+ xfarray_destroy(rp->pptr_recs);
+ rp->pptr_recs = NULL;
+out_xattr_value:
+ kvfree(rp->xattr_value);
+ rp->xattr_value = NULL;
+out_xattr_name:
+ kvfree(rp->xattr_name);
+ rp->xattr_name = NULL;
+ return error;
+}
+
+int
+xrep_parent(
+ struct xfs_scrub *sc)
+{
+ struct xrep_parent *rp = sc->buf;
+ int error;
+
+ /*
+ * When the parent pointers feature is enabled, repairs are committed
+ * by atomically committing a new xattr structure and reaping the old
+ * attr fork. Reaping requires rmap and exchange-range to be enabled.
+ */
+ if (xfs_has_parent(sc->mp)) {
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+ }
+
+ error = xrep_parent_setup_scan(rp);
+ if (error)
+ return error;
+
+ if (xfs_has_parent(sc->mp))
+ error = xrep_parent_scan_dirtree(rp);
+ else
+ error = xrep_parent_find_dotdot(rp);
+ if (error)
+ goto out_teardown;
+
+ /* Last chance to abort before we start committing dotdot fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto out_teardown;
+
+ error = xrep_parent_rebuild_tree(rp);
+ if (error)
+ goto out_teardown;
+ if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+ error = xrep_parent_set_nondir_nlink(rp);
+ if (error)
+ goto out_teardown;
+ }
+
+ error = xrep_defer_finish(sc);
+
+out_teardown:
+ xrep_parent_teardown(rp);
+ return error;
+}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 183d531875ea..58d6d4ed2853 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -212,12 +212,18 @@ xchk_quota_item(
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
} else {
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
offset);
+ if (mp->m_sb.sb_rblocks < dq->q_rtb.count)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
}
- if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
+ if (dq->q_ino.count > fs_icount)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/*
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 0bab4c30cb85..8f4c8d41f308 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -12,7 +12,6 @@
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
-#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -77,8 +76,6 @@ xrep_quota_item_fill_bmap_hole(
irec, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -ENOSPC;
dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
@@ -236,7 +233,7 @@ xrep_quota_item(
rqi->need_quotacheck = true;
dirty = true;
}
- if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) {
dq->q_rtb.reserved -= dq->q_rtb.count;
dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
dq->q_rtb.count = mp->m_sb.sb_rblocks;
@@ -444,10 +441,6 @@ xrep_quota_data_fork(
XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
if (error)
goto out;
- if (nmap != 1) {
- error = -ENOSPC;
- goto out;
- }
ASSERT(nrec.br_startoff == irec.br_startoff);
ASSERT(nrec.br_blockcount == irec.br_blockcount);
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
new file mode 100644
index 000000000000..dc4033b91e44
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.c
@@ -0,0 +1,870 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck
+ * ===============
+ *
+ * Quota counters are "summary" metadata, in the sense that they are computed
+ * as the summation of the block usage counts for every file on the filesystem.
+ * Therefore, we compute the correct icount, bcount, and rtbcount values by
+ * creating a shadow quota counter structure and walking every inode.
+ */
+
+/* Track the quota deltas for a dquot in a transaction. */
+struct xqcheck_dqtrx {
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+
+ int64_t icount_delta;
+
+ int64_t bcount_delta;
+ int64_t delbcnt_delta;
+
+ int64_t rtbcount_delta;
+ int64_t delrtb_delta;
+};
+
+#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS)
+
+/*
+ * Track the quota deltas for all dquots attached to a transaction if the
+ * quota deltas are being applied to an inode that we already scanned.
+ */
+struct xqcheck_dqacct {
+ struct rhash_head hash;
+ uintptr_t tx_id;
+ struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS];
+ unsigned int refcount;
+};
+
+/* Free a shadow dquot accounting structure. */
+static void
+xqcheck_dqacct_free(
+ void *ptr,
+ void *arg)
+{
+ struct xqcheck_dqacct *dqa = ptr;
+
+ kfree(dqa);
+}
+
+/* Set us up to scrub quota counters. */
+int
+xchk_setup_quotacheck(
+ struct xfs_scrub *sc)
+{
+ if (!XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA);
+
+ sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached
+ * to each inode, we create a shadow dquot, and compute the inode count and add
+ * the data/rt block usage from what we see.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the quota counters for an inode that we've already
+ * scanned. This will cause our counts to be incorrect. Therefore, we hook
+ * the live transaction code in two places: (1) when the callers update the
+ * per-transaction dqtrx structure to log quota counter updates; and (2) when
+ * transaction commit actually logs those updates to the incore dquot. By
+ * shadowing transaction updates in this manner, live quotacheck can ensure
+ * by locking the dquot and the shadow structure that its own copies are not
+ * out of date. Because the hook code runs in a different process context from
+ * the scrub code and the scrub state flags are not accessed atomically,
+ * failures in the hook code must abort the iscan and the scrubber must notice
+ * the aborted scan and set the incomplete flag.
+ *
+ * Note that we use srcu notifier hooks to minimize the overhead when live
+ * quotacheck is /not/ running.
+ */
+
+/* Update an incore dquot counter information from a live update. */
+static int
+xqcheck_update_incore_counts(
+ struct xqcheck *xqc,
+ struct xfarray *counts,
+ xfs_dqid_t id,
+ int64_t inodes,
+ int64_t nblks,
+ int64_t rtblks)
+{
+ struct xqcheck_dquot xcdq;
+ int error;
+
+ error = xfarray_load_sparse(counts, id, &xcdq);
+ if (error)
+ return error;
+
+ xcdq.flags |= XQCHECK_DQUOT_WRITTEN;
+ xcdq.icount += inodes;
+ xcdq.bcount += nblks;
+ xcdq.rtbcount += rtblks;
+
+ error = xfarray_store(counts, id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
+
+/* Decide if this is the shadow dquot accounting structure for a transaction. */
+static int
+xqcheck_dqacct_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const uintptr_t *tx_idp = arg->key;
+ const struct xqcheck_dqacct *dqa = obj;
+
+ if (dqa->tx_id != *tx_idp)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xqcheck_dqacct_hash_params = {
+ .min_size = 32,
+ .key_len = sizeof(uintptr_t),
+ .key_offset = offsetof(struct xqcheck_dqacct, tx_id),
+ .head_offset = offsetof(struct xqcheck_dqacct, hash),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xqcheck_dqacct_obj_cmpfn,
+};
+
+/* Find a shadow dqtrx slot for the given dquot. */
+STATIC struct xqcheck_dqtrx *
+xqcheck_get_dqtrx(
+ struct xqcheck_dqacct *dqa,
+ xfs_dqtype_t q_type,
+ xfs_dqid_t q_id)
+{
+ int i;
+
+ for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) {
+ if (dqa->dqtrx[i].q_type == 0 ||
+ (dqa->dqtrx[i].q_type == q_type &&
+ dqa->dqtrx[i].q_id == q_id))
+ return &dqa->dqtrx[i];
+ }
+
+ return NULL;
+}
+
+/*
+ * Create and fill out a quota delta tracking structure to shadow the updates
+ * going on in the regular quota code.
+ */
+static int
+xqcheck_mod_live_ino_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_mod_ino_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb);
+
+ /* Skip quota reservation fields. */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_ICOUNT:
+ case XFS_TRANS_DQ_RTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Ignore dqtrx updates for quota types we don't care about. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ if (!xqc->ucounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_GROUP:
+ if (!xqc->gcounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_PROJ:
+ if (!xqc->pcounts)
+ return NOTIFY_DONE;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Skip inodes that haven't been scanned yet. */
+ if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino))
+ return NOTIFY_DONE;
+
+ /* Make a shadow quota accounting tracker for this transaction. */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa) {
+ dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS);
+ if (!dqa)
+ goto out_abort;
+
+ dqa->tx_id = p->tx_id;
+ error = rhashtable_insert_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Find the shadow dqtrx (or an empty slot) here. */
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx)
+ goto out_abort;
+ if (dqtrx->q_type == 0) {
+ dqtrx->q_type = p->q_type;
+ dqtrx->q_id = p->q_id;
+ dqa->refcount++;
+ }
+
+ /* Update counter */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ dqtrx->bcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELBCOUNT:
+ dqtrx->delbcnt_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_ICOUNT:
+ dqtrx->icount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_RTBCOUNT:
+ dqtrx->rtbcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ dqtrx->delrtb_delta += p->delta;
+ break;
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Apply the transaction quota deltas to our shadow quota accounting info when
+ * the regular quota code are doing the same.
+ */
+static int
+xqcheck_apply_live_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_apply_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ struct xfarray *counts;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb);
+
+ /* Map the dquot type to an incore counter object. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ counts = xqc->ucounts;
+ break;
+ case XFS_DQTYPE_GROUP:
+ counts = xqc->gcounts;
+ break;
+ case XFS_DQTYPE_PROJ:
+ counts = xqc->pcounts;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL)
+ return NOTIFY_DONE;
+
+ /*
+ * Find the shadow dqtrx for this transaction and dquot, if any deltas
+ * need to be applied here. If not, we're finished early.
+ */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa)
+ goto out_unlock;
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx || dqtrx->q_type == 0)
+ goto out_unlock;
+
+ /* Update our shadow dquot if we're committing. */
+ if (action == XFS_APPLY_DQTRX_COMMIT) {
+ error = xqcheck_update_incore_counts(xqc, counts, p->q_id,
+ dqtrx->icount_delta,
+ dqtrx->bcount_delta + dqtrx->delbcnt_delta,
+ dqtrx->rtbcount_delta + dqtrx->delrtb_delta);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Free the shadow accounting structure if that was the last user. */
+ dqa->refcount--;
+ if (dqa->refcount == 0) {
+ error = rhashtable_remove_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ xqcheck_dqacct_free(dqa, NULL);
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/* Record this inode's quota usage in our shadow quota counter data. */
+STATIC int
+xqcheck_collect_inode(
+ struct xqcheck *xqc,
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp = xqc->sc->tp;
+ xfs_filblks_t nblks, rtblks;
+ uint ilock_flags = 0;
+ xfs_dqid_t id;
+ bool isreg = S_ISREG(VFS_I(ip)->i_mode);
+ int error = 0;
+
+ if (xfs_is_metadir_inode(ip) ||
+ xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+ /*
+ * Quota files are never counted towards quota, so we do not
+ * need to take the lock. Files do not switch between the
+ * metadata and regular directory trees without a reallocation,
+ * so we do not need to ILOCK them either.
+ */
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ return 0;
+ }
+
+ /* Figure out the data / rt device block counts. */
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (isreg)
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Read in the data fork for rt files so that _count_blocks
+ * can count the number of blocks allocated from the rt volume.
+ * Inodes do not track that separately.
+ */
+ ilock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_abort;
+ } else {
+ ilock_flags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ /* Update the shadow dquot counters. */
+ mutex_lock(&xqc->lock);
+ if (xqc->ucounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+ error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->gcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+ error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->pcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+ error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+ mutex_unlock(&xqc->lock);
+
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ goto out_ilock;
+
+out_mutex:
+ mutex_unlock(&xqc->lock);
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+ xchk_set_incomplete(xqc->sc);
+out_ilock:
+ xfs_iunlock(ip, ilock_flags);
+ if (isreg)
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* Walk all the allocated inodes and run a quota scan on them. */
+STATIC int
+xqcheck_collect_counts(
+ struct xqcheck *xqc)
+{
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done without
+ * risk of deadlock between sb_internal and the IOLOCK (we take the
+ * IOLOCK to quiesce the file before scanning) because empty
+ * transactions do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
+ error = xqcheck_collect_inode(xqc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xqc->iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Switch out for a real transaction in preparation for building a new
+ * tree.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
+/*
+ * Check the dquot data against what we observed. Caller must hold the dquot
+ * lock.
+ */
+STATIC int
+xqcheck_compare_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int error;
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ xchk_set_incomplete(xqc->sc);
+ return -ECANCELED;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ if (xcdq.icount != dq->q_ino.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.bcount != dq->q_blk.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.rtbcount != dq->q_rtb.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * should never happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xqc->sc);
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error)
+ return error;
+
+ if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return error;
+}
+
+/*
+ * Walk all the observed dquots, and make sure there's a matching incore
+ * dquot and that its counts match ours.
+ */
+STATIC int
+xqcheck_walk_observations(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfs_dquot *dq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+ if (error == -ENOENT) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+ return 0;
+ }
+ if (error)
+ return error;
+
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xqc->sc, &error))
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Compare the quota counters we observed against the live dquots. */
+STATIC int
+xqcheck_compare_dqtype(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_dquot *dq;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* If the quota CHKD flag is cleared, we need to repair this quota. */
+ if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+ return 0;
+ }
+
+ /* Compare what we observed against the actual dquots. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Walk all the observed dquots and compare to the incore ones. */
+ return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/* Tear down everything associated with a quotacheck. */
+static void
+xqcheck_teardown_scan(
+ void *priv)
+{
+ struct xqcheck *xqc = priv;
+ struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xqc->iscan);
+
+ /*
+ * As noted above, the apply hook is responsible for cleaning up the
+ * shadow dquot accounting data when a transaction completes. The mod
+ * hook must be removed before the apply hook so that we don't
+ * mistakenly leave an active shadow account for the mod hook to get
+ * its hands on. No hooks should be running after these functions
+ * return.
+ */
+ xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+ if (xqc->shadow_dquot_acct.key_len) {
+ rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+ xqcheck_dqacct_free, NULL);
+ xqc->shadow_dquot_acct.key_len = 0;
+ }
+
+ if (xqc->pcounts) {
+ xfarray_destroy(xqc->pcounts);
+ xqc->pcounts = NULL;
+ }
+
+ if (xqc->gcounts) {
+ xfarray_destroy(xqc->gcounts);
+ xqc->gcounts = NULL;
+ }
+
+ if (xqc->ucounts) {
+ xfarray_destroy(xqc->ucounts);
+ xqc->ucounts = NULL;
+ }
+
+ xchk_iscan_teardown(&xqc->iscan);
+ mutex_destroy(&xqc->lock);
+ xqc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate quota counter data.
+ * If the scan is successful, the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ */
+STATIC int
+xqcheck_setup_scan(
+ struct xfs_scrub *sc,
+ struct xqcheck *xqc)
+{
+ char *descr;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL;
+ int error;
+
+ ASSERT(xqc->sc == NULL);
+ xqc->sc = sc;
+
+ mutex_init(&xqc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+ error = -ENOMEM;
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+ descr = xchk_xfile_descr(sc, "user dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->ucounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+ descr = xchk_xfile_descr(sc, "group dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->gcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+ descr = xchk_xfile_descr(sc, "project dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->pcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ /*
+ * Set up hash table to map transactions to our internal shadow dqtrx
+ * structures.
+ */
+ error = rhashtable_init(&xqc->shadow_dquot_acct,
+ &xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the quota code. The hook only triggers for inodes that
+ * were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ *
+ * The apply hook (which removes the shadow dquot accounting struct)
+ * must be installed before the mod hook so that we never fail to catch
+ * the end of a quota update sequence and leave stale shadow data.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+ xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+ xqcheck_apply_live_dqtrx);
+
+ error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the quota count data to repair. */
+ sc->buf_cleanup = xqcheck_teardown_scan;
+ return 0;
+
+out_teardown:
+ xqcheck_teardown_scan(xqc);
+ return error;
+}
+
+/* Scrub all counters for a given quota type. */
+int
+xchk_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ int error = 0;
+
+ /* Check quota counters on the live filesystem. */
+ error = xqcheck_setup_scan(sc, xqc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up quota information. */
+ error = xqcheck_collect_counts(xqc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare quota counters. */
+ if (xqc->ucounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/* Quota counters for live quotacheck. */
+struct xqcheck_dquot {
+ /* block usage count */
+ int64_t bcount;
+
+ /* inode usage count */
+ int64_t icount;
+
+ /* realtime block usage count */
+ int64_t rtbcount;
+
+ /* Record state */
+ unsigned int flags;
+};
+
+/*
+ * This incore dquot record has been written at least once. We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN (1U << 0)
+
+/* Already checked this dquot. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1)
+
+/* Already repaired this dquot. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+ struct xfs_scrub *sc;
+
+ /* Shadow dquot counter data. */
+ struct xfarray *ucounts;
+ struct xfarray *gcounts;
+ struct xfarray *pcounts;
+
+ /* Lock protecting quotacheck count observations */
+ struct mutex lock;
+
+ struct xchk_iscan iscan;
+
+ /* Hooks into the quota code. */
+ struct xfs_dqtrx_hook qhook;
+
+ /* Shadow quota delta tracking structure. */
+ struct rhashtable shadow_dquot_acct;
+};
+
+/* Return the incore counter array for a given quota type. */
+static inline struct xfarray *
+xqcheck_counters_for(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ return xqc->ucounts;
+ case XFS_DQTYPE_GROUP:
+ return xqc->gcounts;
+ case XFS_DQTYPE_PROJ:
+ return xqc->pcounts;
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
+/* Commit new counters to a dquot. */
+static int
+xqcheck_commit_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int64_t delta;
+ bool dirty = false;
+ int error = 0;
+
+ /* Unlock the dquot just long enough to allocate a transaction. */
+ xfs_dqunlock(dq);
+ error = xchk_trans_alloc(xqc->sc, 0);
+ xfs_dqlock(dq);
+ if (error)
+ return error;
+
+ xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ /* Adjust counters as needed. */
+ delta = (int64_t)xcdq.icount - dq->q_ino.count;
+ if (delta) {
+ dq->q_ino.reserved += delta;
+ dq->q_ino.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+ if (delta) {
+ dq->q_blk.reserved += delta;
+ dq->q_blk.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+ if (delta) {
+ dq->q_rtb.reserved += delta;
+ dq->q_rtb.count += delta;
+ dirty = true;
+ }
+
+ xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the repair
+ * and must cancel the whole operation. This should never
+ * happen, but we need to catch it anyway.
+ */
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error || !dirty)
+ goto out_cancel;
+
+ trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+ /* Commit the dirty dquot to disk. */
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ if (dq->q_id)
+ xfs_qm_adjust_dqtimers(dq);
+ xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+ /*
+ * Transaction commit unlocks the dquot, so we must re-lock it so that
+ * the caller can put the reference (which apparently requires a locked
+ * dquot).
+ */
+ error = xrep_trans_commit(xqc->sc);
+ xfs_dqlock(dq);
+ return error;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+out_cancel:
+ xchk_trans_cancel(xqc->sc);
+
+ /* Re-lock the dquot so the caller can put the reference. */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Commit new quota counters for a particular quota type. */
+STATIC int
+xqcheck_commit_dqtype(
+ struct xqcheck *xqc,
+ unsigned int dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xqcheck_dquot xcdq;
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ struct xfs_dquot *dq;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ /*
+ * Update the counters of every dquot that the quota file knows about.
+ */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Make a second pass to deal with the dquots that we know about but
+ * the quota file previously did not know about.
+ */
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ /*
+ * Grab the dquot, allowing for dquot block allocation in a
+ * separate transaction. We committed the scrub transaction
+ * in a previous step, so we will not be creating nested
+ * transactions here.
+ */
+ error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+ if (error)
+ return error;
+
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Figure out quota CHKD flags for the running quota types. */
+static inline unsigned int
+xqcheck_chkd_flags(
+ struct xfs_mount *mp)
+{
+ unsigned int ret = 0;
+
+ if (XFS_IS_UQUOTA_ON(mp))
+ ret |= XFS_UQUOTA_CHKD;
+ if (XFS_IS_GQUOTA_ON(mp))
+ ret |= XFS_GQUOTA_CHKD;
+ if (XFS_IS_PQUOTA_ON(mp))
+ ret |= XFS_PQUOTA_CHKD;
+ return ret;
+}
+
+/* Commit the new dquot counters. */
+int
+xrep_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ unsigned int qflags = xqcheck_chkd_flags(sc->mp);
+ int error;
+
+ /*
+ * Clear the CHKD flag for the running quota types and commit the scrub
+ * transaction so that we can allocate new quota block mappings if we
+ * have to. If we crash after this point, the sb still has the CHKD
+ * flags cleared, so mount quotacheck will fix all of this up.
+ */
+ xrep_update_qflags(sc, qflags, 0);
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Commit the new counters to the dquots. */
+ if (xqc->ucounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+ if (error)
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (error)
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (error)
+ return error;
+ }
+
+ /* Set the CHKD flags now that we've fixed quota counts. */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ xrep_update_qflags(sc, 0, qflags);
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+ struct xfs_mount *mp;
+ struct xfbtree xfbtree;
+ uint64_t nr_items;
+};
+
+int
+rcbag_init(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp,
+ struct rcbag **bagp)
+{
+ struct rcbag *bag;
+ int error;
+
+ bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+ if (!bag)
+ return -ENOMEM;
+
+ bag->nr_items = 0;
+ bag->mp = mp;
+
+ error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+ if (error)
+ goto out_bag;
+
+ *bagp = bag;
+ return 0;
+
+out_bag:
+ kfree(bag);
+ return error;
+}
+
+void
+rcbag_free(
+ struct rcbag **bagp)
+{
+ struct rcbag *bag = *bagp;
+
+ xfbtree_destroy(&bag->xfbtree);
+ kfree(bag);
+ *bagp = NULL;
+}
+
+/* Track an rmap in the refcount bag. */
+int
+rcbag_add(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = rcbagbt_lookup_eq(cur, rmap, &has);
+ if (error)
+ goto out_cur;
+
+ if (has) {
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bagrec.rbg_refcount++;
+ error = rcbagbt_update(cur, &bagrec);
+ if (error)
+ goto out_cur;
+ } else {
+ bagrec.rbg_startblock = rmap->rm_startblock;
+ bagrec.rbg_blockcount = rmap->rm_blockcount;
+ bagrec.rbg_refcount = 1;
+
+ error = rcbagbt_insert(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ error = xfbtree_trans_commit(&bag->xfbtree, tp);
+ if (error)
+ return error;
+
+ bag->nr_items++;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Return the number of records in the bag. */
+uint64_t
+rcbag_count(
+ const struct rcbag *rcbag)
+{
+ return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+ return r->rbg_startblock + r->rbg_blockcount;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap,
+ bool next_valid,
+ uint32_t *next_bnop)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ uint32_t next_bno = NULLAGBLOCK;
+ int has;
+ int error;
+
+ if (next_valid)
+ next_bno = next_rmap->rm_startblock;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+ }
+
+ /*
+ * We should have found /something/ because either next_rrm is the next
+ * interesting rmap to look at after emitting this refcount extent, or
+ * there are other rmaps in rmap_bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (next_bno == NULLAGBLOCK) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ *next_bnop = next_bno;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Pop all refcount bag records that end at next_bno */
+int
+rcbag_remove_ending_at(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ uint32_t next_bno)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ /* go to the right edge of the tree */
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_decrement(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ if (rcbag_rec_next_bno(&bagrec) != next_bno)
+ continue;
+
+ error = xfs_btree_delete(cur, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bag->nr_items -= bagrec.rbg_refcount;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+ return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Dump the rcbag. */
+void
+rcbag_dump(
+ struct rcbag *bag,
+ struct xfs_trans *tp)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ unsigned long long nr = 0;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+ nr++,
+ (unsigned int)bagrec.rbg_startblock,
+ (unsigned int)bagrec.rbg_blockcount,
+ (unsigned long long)bagrec.rbg_refcount);
+ }
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+ struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap, bool next_valid,
+ uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+ uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache *rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ struct rcbag_key *bag_key = (struct rcbag_key *)key;
+ const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec;
+
+ BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+ BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+ bag_key->rbg_startblock = bag_rec->rbg_startblock;
+ bag_key->rbg_blockcount = bag_rec->rbg_blockcount;
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec;
+ struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+ bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+ bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+ bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+ const struct rcbag_key *kp = (const struct rcbag_key *)key;
+
+ if (kp->rbg_startblock > rec->rbg_startblock)
+ return 1;
+ if (kp->rbg_startblock < rec->rbg_startblock)
+ return -1;
+
+ if (kp->rbg_blockcount > rec->rbg_blockcount)
+ return 1;
+ if (kp->rbg_blockcount < rec->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ ASSERT(mask == NULL);
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 1;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return -1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 1;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 0;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return 1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 0;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1;
+ const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2;
+
+ if (rp1->rbg_startblock > rp2->rbg_startblock)
+ return 0;
+ if (rp1->rbg_startblock < rp2->rbg_startblock)
+ return 1;
+
+ if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+ return 0;
+ if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= rcbagbt_maxlevels_possible())
+ return __this_address;
+
+ maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = rcbagbt_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+ .name = "rcbagbt_mem",
+ .magic = { 0, cpu_to_be32(RCBAG_MAGIC) },
+ .verify_read = rcbagbt_rw_verify,
+ .verify_write = rcbagbt_rw_verify,
+ .verify_struct = rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+ .name = "rcbag",
+ .type = XFS_BTREE_TYPE_MEM,
+
+ .rec_len = sizeof(struct rcbag_rec),
+ .key_len = sizeof(struct rcbag_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = 1,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = rcbagbt_init_key_from_rec,
+ .init_rec_from_cur = rcbagbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = rcbagbt_key_diff,
+ .buf_ops = &rcbagbt_mem_buf_ops,
+ .diff_two_keys = rcbagbt_diff_two_keys,
+ .keys_inorder = rcbagbt_keys_inorder,
+ .recs_inorder = rcbagbt_recs_inorder,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbtree)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+ rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+ cur->bc_mem.xfbtree = xfbtree;
+ cur->bc_nlevels = xfbtree->nlevels;
+ return cur;
+}
+
+/* Create an in-memory refcount bag btree. */
+int
+rcbagbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp)
+{
+ xfbt->owner = 0;
+ return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+}
+
+/* Calculate number of records in a refcount bag btree block. */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct rcbag_rec);
+ return blocklen /
+ (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount bag btree block.
+ */
+unsigned int
+rcbagbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= RCBAG_BLOCK_LEN;
+ return rcbagbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for refcount bag btrees. */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/* Calculate the refcount bag btree size for some records. */
+unsigned long long
+rcbagbt_calc_size(
+ unsigned long long nr_records)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+int __init
+rcbagbt_init_cur_cache(void)
+{
+ rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+ xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+ 0, 0, NULL);
+
+ if (!rcbagbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+rcbagbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(rcbagbt_cur_cache);
+ rcbagbt_cur_cache = NULL;
+}
+
+/* Look up the refcount bag record corresponding to this reverse mapping. */
+int
+rcbagbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap,
+ int *success)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+
+ rec->rbg_startblock = rmap->rm_startblock;
+ rec->rbg_blockcount = rmap->rm_blockcount;
+
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/* Get the data from the pointed-to record. */
+int
+rcbagbt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct rcbag_rec *rec,
+ int *has)
+{
+ union xfs_btree_rec *btrec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &btrec, has);
+ if (error || !(*has))
+ return error;
+
+ memcpy(rec, btrec, sizeof(struct rcbag_rec));
+ return 0;
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_update(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec)
+{
+ union xfs_btree_rec btrec;
+
+ memcpy(&btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_update(cur, &btrec);
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_insert(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec,
+ int *success)
+{
+ struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+ memcpy(btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */
+
+struct rcbag_key {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+};
+
+struct rcbag_rec {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+ uint64_t rbg_refcount;
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+ ((struct rcbag_rec *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+ ((struct rcbag_key *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+ ((rcbag_ptr_t *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct rcbag_key) + \
+ ((index) - 1) * sizeof(rcbag_ptr_t)))
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+ int *success);
+
+#else
+# define rcbagbt_init_cur_cache() 0
+# define rcbagbt_destroy_cur_cache() ((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index 16462332c897..01c9a2dc0f2c 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_error.h"
#include "scrub/scrub.h"
+#include "scrub/common.h"
#include "scrub/readdir.h"
/* Call a function for every entry in a shortform directory. */
@@ -99,7 +100,7 @@ xchk_dir_walk_block(
unsigned int off, next_off, end;
int error;
- error = xfs_dir3_block_read(sc->tp, dp, &bp);
+ error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp);
if (error)
return error;
@@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf(
if (new_off > *curoff)
*curoff = new_off;
- return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp);
+ return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp);
}
/* Call a function for every entry in a leaf directory. */
@@ -273,32 +274,27 @@ xchk_dir_walk(
.dp = dp,
.geo = dp->i_mount->m_dir_geo,
.trans = sc->tp,
+ .owner = dp->i_ino,
};
- bool isblock;
int error;
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ switch (xfs_dir2_format(&args, &error)) {
+ case XFS_DIR2_FMT_SF:
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
-
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
-
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
-
- if (isblock)
+ case XFS_DIR2_FMT_BLOCK:
return xchk_dir_walk_block(sc, dp, dirent_fn, priv);
-
- return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ case XFS_DIR2_FMT_LEAF:
+ case XFS_DIR2_FMT_NODE:
+ return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+ default:
+ return error;
+ }
}
/*
@@ -324,50 +320,102 @@ xchk_dir_lookup(
.hashval = xfs_dir2_hashname(dp->i_mount, name),
.whichfork = XFS_DATA_FORK,
.op_flags = XFS_DA_OP_OKNOENT,
+ .owner = dp->i_ino,
};
- bool isblock, isleaf;
int error;
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ /*
+ * A temporary directory's block headers are written with the owner
+ * set to sc->ip, so we must switch the owner here for the lookup.
+ */
+ if (dp == sc->tempip)
+ args.owner = sc->ip->i_ino;
+
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- error = xfs_dir2_sf_lookup(&args);
- goto out_check_rval;
- }
+ error = xfs_dir_lookup_args(&args);
+ if (!error)
+ *ino = args.inumber;
+ return error;
+}
- /* dir2 functions require that the data fork is loaded */
- error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
- if (error)
- return error;
+/*
+ * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock
+ * state. The caller may have a transaction, so we must use trylock for both
+ * IOLOCKs.
+ */
+static inline unsigned int
+xchk_dir_trylock_both(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+ return 0;
- error = xfs_dir2_isblock(&args, &isblock);
- if (error)
- return error;
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ goto parent_iolock;
- if (isblock) {
- error = xfs_dir2_block_lookup(&args);
- goto out_check_rval;
- }
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto parent_ilock;
- error = xfs_dir2_isleaf(&args, &isleaf);
- if (error)
- return error;
+ return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+
+parent_ilock:
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+parent_iolock:
+ xchk_iunlock(sc, XFS_IOLOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target
+ * (@sc->ip) and the inode at the other end (@ip) of a directory or parent
+ * pointer link so that we can check that link.
+ *
+ * We do not know ahead of time that the directory tree is /not/ corrupt, so we
+ * cannot use the "lock two inode" functions because we do not know that there
+ * is not a racing thread trying to take the locks in opposite order. First
+ * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED
+ * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub
+ * target and @ip to synchronize with XFS.
+ *
+ * If the trylocks succeed, *lockmode will be set to the locks held for @ip;
+ * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will
+ * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if
+ * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed.
+ */
+int
+xchk_dir_trylock_for_pptrs(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ unsigned int *lockmode)
+{
+ unsigned int nr;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags == 0);
+
+ for (nr = 0; nr < HZ; nr++) {
+ *lockmode = xchk_dir_trylock_both(sc, ip);
+ if (*lockmode)
+ return 0;
- if (isleaf) {
- error = xfs_dir2_leaf_lookup(&args);
- goto out_check_rval;
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
}
- error = xfs_dir2_node_lookup(&args);
+ if (sc->flags & XCHK_TRY_HARDER) {
+ xchk_set_incomplete(sc);
+ return -ETIMEDOUT;
+ }
-out_check_rval:
- if (error == -EEXIST)
- error = 0;
- if (!error)
- *ino = args.inumber;
- return error;
+ return -EDEADLOCK;
}
diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h
index 55787f4df123..da501877a64d 100644
--- a/fs/xfs/scrub/readdir.h
+++ b/fs/xfs/scrub/readdir.h
@@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp,
int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t *ino);
+int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int *lockmode);
+
#endif /* __XFS_SCRUB_READDIR_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index f99eca799809..8703897c0a9c 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -33,6 +33,9 @@
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -40,6 +43,7 @@
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
+#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -114,7 +118,7 @@ xreap_put_freelist(
int error;
/* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
+ error = xrep_fix_freelist(sc, 0);
if (error)
return error;
@@ -137,7 +141,7 @@ xreap_put_freelist(
agfl_bp, agbno, 0);
if (error)
return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
@@ -211,6 +215,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs)
rs->force_roll = false;
}
+/*
+ * Compute the maximum length of a buffer cache scan (in units of sectors),
+ * given a quantity of fs blocks.
+ */
+xfs_daddr_t
+xrep_bufscan_max_sectors(
+ struct xfs_mount *mp,
+ xfs_extlen_t fsblocks)
+{
+ int max_fsbs;
+
+ /* Remote xattr values are the largest buffers that we support. */
+ max_fsbs = xfs_attr3_max_rmt_blocks(mp);
+
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+}
+
+/*
+ * Return an incore buffer from a sector scan, or NULL if there are no buffers
+ * left to return.
+ */
+struct xfs_buf *
+xrep_bufscan_advance(
+ struct xfs_mount *mp,
+ struct xrep_bufscan *scan)
+{
+ scan->__sector_count += scan->daddr_step;
+ while (scan->__sector_count <= scan->max_sectors) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
+ scan->__sector_count, XBF_LIVESCAN, &bp);
+ if (!error)
+ return bp;
+
+ scan->__sector_count += scan->daddr_step;
+ }
+
+ return NULL;
+}
+
/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
@@ -221,7 +267,6 @@ xreap_agextent_binval(
struct xfs_scrub *sc = rs->sc;
struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sc->sa.pag->pag_agno;
xfs_agblock_t agbno_next = agbno + *aglenp;
xfs_agblock_t bno = agbno;
@@ -241,28 +286,15 @@ xreap_agextent_binval(
* of any plausible size.
*/
while (bno < agbno_next) {
- xfs_agblock_t fsbcount;
- xfs_agblock_t max_fsbs;
-
- /*
- * Max buffer size is the max remote xattr buffer size, which
- * is one fs block larger than 64k.
- */
- max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
- xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
-
- for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
- struct xfs_buf *bp = NULL;
- xfs_daddr_t daddr;
- int error;
-
- daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
- error = xfs_buf_incore(mp->m_ddev_targp, daddr,
- XFS_FSB_TO_BB(mp, fsbcount),
- XBF_LIVESCAN, &bp);
- if (error)
- continue;
-
+ struct xrep_bufscan scan = {
+ .daddr = xfs_agbno_to_daddr(pag, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ agbno_next - bno),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
rs->invalidated++;
@@ -282,7 +314,7 @@ xreap_agextent_binval(
}
out:
- trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
}
/*
@@ -341,7 +373,8 @@ xreap_agextent_select(
out_found:
*aglenp = len;
- trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+ trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
out_cur:
xfs_btree_del_cursor(cur, error);
return error;
@@ -362,7 +395,9 @@ xreap_agextent_iter(
xfs_fsblock_t fsbno;
int error = 0;
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
+ ASSERT(rs->resv != XFS_AG_RESV_METAFILE);
+
+ fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);
/*
* If there are other rmappings, this block is cross linked and must
@@ -378,7 +413,8 @@ xreap_agextent_iter(
* to run xfs_repair.
*/
if (crosslinked) {
- trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
+ *aglenp);
rs->force_roll = true;
@@ -388,7 +424,8 @@ xreap_agextent_iter(
* records from the refcountbt, which will remove the
* rmap record as well.
*/
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
+ *aglenp);
return 0;
}
@@ -396,7 +433,7 @@ xreap_agextent_iter(
*aglenp, rs->oinfo);
}
- trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
/*
* Invalidate as many buffers as we can, starting at agbno. If this
@@ -420,9 +457,9 @@ xreap_agextent_iter(
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
ASSERT(rs->resv == XFS_AG_RESV_NONE);
- xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
- rs->resv, true);
+ rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -448,7 +485,7 @@ xreap_agextent_iter(
* system with large EFIs.
*/
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
- rs->resv, true);
+ rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -646,3 +683,637 @@ xrep_reap_fsblocks(
return 0;
}
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. Units are rt blocks, not rt extents.
+ */
+STATIC int
+xreap_rgextent_select(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_rgblock_t rgbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *rglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_rgblock_t bno = rgbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
+ error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < rgbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ *rglenp = len;
+ trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
+ *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this rtgroup extent as possible.
+ * The number of blocks disposed of will be returned in @rglenp.
+ */
+STATIC int
+xreap_rgextent_iter(
+ struct xreap_state *rs,
+ xfs_rgblock_t rgbno,
+ xfs_extlen_t *rglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rtblock_t rtbno;
+ int error;
+
+ /*
+ * The only caller so far is CoW fork repair, so we only know how to
+ * unlink or free CoW staging extents. Here we don't have to worry
+ * about invalidating buffers!
+ */
+ if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the forward and reverse mapping and move on.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
+ *rglenp);
+
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ rs->deferred++;
+ return 0;
+ }
+
+ trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
+
+ /*
+ * The CoW staging extent is not crosslinked. Use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
+ error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
+ rs->resv,
+ XFS_FREE_EXTENT_REALTIME |
+ XFS_FREE_EXTENT_SKIP_DISCARD);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_RMAP | \
+ XFS_RTGLOCK_REFCOUNT)
+
+/*
+ * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * be aligned to a realtime extent.
+ */
+STATIC int
+xreap_rtmeta_extent(
+ uint64_t rtbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
+ xfs_rgblock_t rgbno_next = rgbno + len;
+ int error = 0;
+
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sr.rtg);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to init the xchk_ag structure ourselves.
+ */
+ sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
+ if (!sc->sr.rtg)
+ return -EFSCORRUPTED;
+
+ xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+
+ while (rgbno < rgbno_next) {
+ xfs_extlen_t rglen;
+ bool crosslinked;
+
+ error = xreap_rgextent_select(rs, rgbno, rgbno_next,
+ &crosslinked, &rglen);
+ if (error)
+ goto out_unlock;
+
+ error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
+ if (error)
+ goto out_unlock;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out_unlock;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out_unlock;
+ xreap_reset(rs);
+ }
+
+ rgbno += rglen;
+ }
+
+out_unlock:
+ xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
+ xfs_rtgroup_put(sc->sr.rtg);
+ sc->sr.rtg = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every rt metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_rtblocks(
+ struct xfs_scrub *sc,
+ struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Dispose of every block of an old metadata btree that used to be rooted in a
+ * metadata directory file.
+ */
+int
+xrep_reap_metadir_fsblocks(
+ struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap)
+{
+ /*
+ * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old
+ * blocks are no longer mapped by the inode, and inode metadata space
+ * reservations can only account freed space to the i_nblocks.
+ */
+ struct xfs_owner_info oinfo;
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = &oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+ ASSERT(xfs_is_metadir_inode(sc->ip));
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
+
+ error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return xrep_reset_metafile_resv(sc);
+}
+
+/*
+ * Metadata files are not supposed to share blocks with anything else.
+ * If blocks are shared, we remove the reverse mapping (thus reducing the
+ * crosslink factor); if blocks are not shared, we also need to free them.
+ *
+ * This first step determines the longest subset of the passed-in imap
+ * (starting at its beginning) that is either crosslinked or not crosslinked.
+ * The blockcount will be adjust down as needed.
+ */
+STATIC int
+xreap_bmapi_select(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool *crosslinked)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_btree_cur *cur;
+ xfs_filblks_t len = 1;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
+ if (error)
+ goto out_cur;
+
+ bno = agbno + 1;
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ oinfo.oi_offset++;
+ error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (also_crosslinked != *crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+ imap->br_blockcount = len;
+ trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
+ *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Decide if this buffer can be joined to a transaction. This is true for most
+ * buffers, but there are two cases that we want to catch: large remote xattr
+ * value buffers are not logged and can overflow the buffer log item dirty
+ * bitmap size; and oversized cached buffers if things have really gone
+ * haywire.
+ */
+static inline bool
+xreap_buf_loggable(
+ const struct xfs_buf *bp)
+{
+ int i;
+
+ for (i = 0; i < bp->b_map_count; i++) {
+ int chunks;
+ int map_size;
+
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+ if (map_size > XFS_BLF_DATAMAP_SIZE)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Invalidate any buffers for this file mapping. The @imap blockcount may be
+ * adjusted downward if we need to roll the transaction.
+ */
+STATIC int
+xreap_bmapi_binval(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ xfs_fileoff_t off;
+ xfs_fileoff_t max_off;
+ xfs_extlen_t scan_blocks;
+ xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t agbno_next;
+ unsigned int invalidated = 0;
+ int error;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+ agbno_next = agbno + imap->br_blockcount;
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return 0;
+
+ /*
+ * Buffers for file blocks can span multiple contiguous mappings. This
+ * means that for each block in the mapping, there could exist an
+ * xfs_buf indexed by that block with any length up to the maximum
+ * buffer size (remote xattr values) or to the next hole in the fork.
+ * To set up our binval scan, first we need to figure out the location
+ * of the next hole.
+ */
+ off = imap->br_startoff + imap->br_blockcount;
+ max_off = off + xfs_attr3_max_rmt_blocks(mp);
+ while (off < max_off) {
+ struct xfs_bmbt_irec hmap;
+ int nhmaps = 1;
+
+ error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ &nhmaps, bmap_flags);
+ if (error)
+ return error;
+ if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_bmap_is_real_extent(&hmap))
+ break;
+
+ off = hmap.br_startoff + hmap.br_blockcount;
+ }
+ scan_blocks = off - imap->br_startoff;
+
+ trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. If
+ * we can't (try)lock the buffer we assume it's owned by someone else
+ * and leave it alone. The buffer cache cannot detect aliasing, so
+ * employ nested loops to detect incore buffers of any plausible size.
+ */
+ while (bno < agbno_next) {
+ struct xrep_bufscan scan = {
+ .daddr = xfs_agbno_to_daddr(pag, bno),
+ .max_sectors = xrep_bufscan_max_sectors(mp,
+ scan_blocks),
+ .daddr_step = XFS_FSB_TO_BB(mp, 1),
+ };
+ struct xfs_buf *bp;
+
+ while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+ if (xreap_buf_loggable(bp)) {
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ } else {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+ invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * much of the mapping we've seen so far.
+ */
+ if (invalidated > XREAP_MAX_BINVAL) {
+ imap->br_blockcount = agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ scan_blocks--;
+ }
+
+out:
+ trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
+ imap->br_blockcount);
+ return 0;
+}
+
+/*
+ * Dispose of as much of the beginning of this file fork mapping as possible.
+ * The number of blocks disposed of is returned in @imap->br_blockcount.
+ */
+STATIC int
+xrep_reap_bmapi_iter(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap,
+ bool crosslinked)
+{
+ int error;
+
+ if (crosslinked) {
+ /*
+ * If there are other rmappings, this block is cross linked and
+ * must not be freed. Remove the reverse mapping, leave the
+ * buffer cache in its possibly confused state, and move on.
+ * We don't want to risk discarding valid data buffers from
+ * anybody else who thinks they own the block, even though that
+ * runs the risk of stale buffer warnings in the future.
+ */
+ trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Schedule removal of the mapping from the fork. We use
+ * deferred log intents in this function to control the exact
+ * sequence of metadata updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ return 0;
+ }
+
+ /*
+ * If the block is not crosslinked, we can invalidate all the incore
+ * buffers for the extent, and then free the extent. This is a bit of
+ * a mess since we don't detect discontiguous buffers that are indexed
+ * by a block starting before the first block of the extent but overlap
+ * anyway.
+ */
+ trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
+ XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
+ imap->br_blockcount);
+
+ /*
+ * Invalidate as many buffers as we can, starting at the beginning of
+ * this mapping. If this function sets blockcount to zero, the
+ * transaction is full of logged buffer invalidations, so we need to
+ * return early so that we can roll and retry.
+ */
+ error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ if (error || imap->br_blockcount == 0)
+ return error;
+
+ /*
+ * Schedule removal of the mapping from the fork. We use deferred log
+ * intents in this function to control the exact sequence of metadata
+ * updates.
+ */
+ xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -(int64_t)imap->br_blockcount);
+ return xfs_free_extent_later(sc->tp, imap->br_startblock,
+ imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_SKIP_DISCARD);
+}
+
+/*
+ * Dispose of as much of this file extent as we can. Upon successful return,
+ * the imap will reflect the mapping that was removed from the fork.
+ */
+STATIC int
+xreap_ifork_extent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *imap)
+{
+ xfs_agnumber_t agno;
+ bool crosslinked;
+ int error;
+
+ ASSERT(sc->sa.pag == NULL);
+
+ trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ /*
+ * Decide the fate of the blocks at the beginning of the mapping, then
+ * update the mapping to use it with the unmap calls.
+ */
+ error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ if (error)
+ goto out_agf;
+
+ error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ if (error)
+ goto out_agf;
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of each block mapped to the given fork of the given file. Callers
+ * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
+ * must not have any delalloc reservations.
+ */
+int
+xrep_reap_ifork(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ xfs_fileoff_t off = 0;
+ int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(ip == sc->ip || ip == sc->tempip);
+ ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+
+ while (off < XFS_MAX_FILEOFF) {
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+
+ /* Read the next extent, skip past holes and delalloc. */
+ error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
+ &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * If this is a real space mapping, reap as much of it as we
+ * can in a single transaction.
+ */
+ if (xfs_bmap_is_real_extent(&imap)) {
+ error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ if (error)
+ return error;
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ off = imap.br_startoff + imap.br_blockcount;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index 0b69f16dd98f..4c8f62701fb3 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -13,5 +13,35 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
const struct xfs_owner_info *oinfo);
+int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
+int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap);
+
+#ifdef CONFIG_XFS_RT
+int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
+#else
+# define xrep_reap_rtblocks(...) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
+/* Buffer cache scan context. */
+struct xrep_bufscan {
+ /* Disk address for the buffers we want to scan. */
+ xfs_daddr_t daddr;
+
+ /* Maximum number of sectors to scan. */
+ xfs_daddr_t max_sectors;
+
+ /* Each round, increment the search length by this number of sectors. */
+ xfs_daddr_t daddr_step;
+
+ /* Internal scan state; initialize to zero. */
+ xfs_daddr_t __sector_count;
+};
+
+xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp,
+ xfs_extlen_t fsblocks);
+struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp,
+ struct xrep_bufscan *scan);
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index bf22f245bbfa..d46528023015 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
@@ -17,6 +19,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
{
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_refcountbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -409,7 +421,7 @@ xchk_refcount_mergeable(
if (r1->rc_refcount != r2->rc_refcount)
return false;
if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
- MAXREFCEXTLEN)
+ XFS_REFC_LEN_MAX)
return false;
return true;
@@ -441,7 +453,8 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
- if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_refcount_check_irec(to_perag(bs->cur->bc_group), &irec) !=
+ NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -478,7 +491,7 @@ xchk_refcount_xref_rmap(
struct xfs_scrub *sc,
xfs_filblks_t cow_blocks)
{
- xfs_extlen_t refcbt_blocks = 0;
+ xfs_filblks_t refcbt_blocks = 0;
xfs_filblks_t blocks;
int error;
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index f38fccc42a20..9c8cb5332da0 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -25,6 +25,7 @@
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -37,6 +38,7 @@
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
+#include "scrub/rcbag.h"
/*
* Rebuilding the Reference Count Btree
@@ -97,12 +99,6 @@
* insert all the records.
*/
-/* The only parts of the rmap that we care about for computing refcounts. */
-struct xrep_refc_rmap {
- xfs_agblock_t startblock;
- xfs_extlen_t blockcount;
-} __packed;
-
struct xrep_refc {
/* refcount extents */
struct xfarray *refcount_records;
@@ -122,6 +118,20 @@ struct xrep_refc {
xfs_extlen_t btblocks;
};
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_ag_refcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
@@ -173,13 +183,13 @@ xrep_refc_stash(
if (xchk_should_terminate(sc, &error))
return error;
- irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+ irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
error = xrep_refc_check_ext(rr->sc, &irec);
if (error)
return error;
- trace_xrep_refc_found(sc->sa.pag, &irec);
+ trace_xrep_refc_found(pag_group(sc->sa.pag), &irec);
return xfarray_append(rr->refcount_records, &irec);
}
@@ -205,7 +215,7 @@ xrep_refc_rmap_shareable(
return false;
/* Metadata in files are never shareable */
- if (xfs_internal_inum(mp, rmap->rm_owner))
+ if (xfs_is_sb_inum(mp, rmap->rm_owner))
return false;
/* Metadata and unwritten file blocks are not shareable. */
@@ -223,10 +233,9 @@ xrep_refc_rmap_shareable(
STATIC int
xrep_refc_walk_rmaps(
struct xrep_refc *rr,
- struct xrep_refc_rmap *rrm,
+ struct xfs_rmap_irec *rmap,
bool *have_rec)
{
- struct xfs_rmap_irec rmap;
struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
struct xfs_mount *mp = cur->bc_mp;
int have_gt;
@@ -250,29 +259,30 @@ xrep_refc_walk_rmaps(
if (!have_gt)
return 0;
- error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, !have_gt))
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
- if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
- error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
- rmap.rm_blockcount);
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
- } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
/* refcountbt block, dump it when we're done. */
- rr->btblocks += rmap.rm_blockcount;
+ rr->btblocks += rmap->rm_blockcount;
error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
- rmap.rm_startblock, rmap.rm_blockcount);
+ rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
}
- } while (!xrep_refc_rmap_shareable(mp, &rmap));
+ } while (!xrep_refc_rmap_shareable(mp, rmap));
- rrm->startblock = rmap.rm_startblock;
- rrm->blockcount = rmap.rm_blockcount;
*have_rec = true;
return 0;
}
@@ -354,45 +364,6 @@ xrep_refc_sort_records(
return error;
}
-#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
-/*
- * Find the next block where the refcount changes, given the next rmap we
- * looked at and the ones we're already tracking.
- */
-static inline int
-xrep_refc_next_edge(
- struct xfarray *rmap_bag,
- struct xrep_refc_rmap *next_rrm,
- bool next_valid,
- xfs_agblock_t *nbnop)
-{
- struct xrep_refc_rmap rrm;
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
- xfs_agblock_t nbno = NULLAGBLOCK;
- int error;
-
- if (next_valid)
- nbno = next_rrm->startblock;
-
- while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
- nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
-
- if (error)
- return error;
-
- /*
- * We should have found /something/ because either next_rrm is the next
- * interesting rmap to look at after emitting this refcount extent, or
- * there are other rmaps in rmap_bag contributing to the current
- * sharing count. But if something is seriously wrong, bail out.
- */
- if (nbno == NULLAGBLOCK)
- return -EFSCORRUPTED;
-
- *nbnop = nbno;
- return 0;
-}
-
/*
* Walk forward through the rmap btree to collect all rmaps starting at
* @bno in @rmap_bag. These represent the file(s) that share ownership of
@@ -402,22 +373,21 @@ xrep_refc_next_edge(
static int
xrep_refc_push_rmaps_at(
struct xrep_refc *rr,
- struct xfarray *rmap_bag,
+ struct rcbag *rcstack,
xfs_agblock_t bno,
- struct xrep_refc_rmap *rrm,
- bool *have,
- uint64_t *stack_sz)
+ struct xfs_rmap_irec *rmap,
+ bool *have)
{
struct xfs_scrub *sc = rr->sc;
int have_gt;
int error;
- while (*have && rrm->startblock == bno) {
- error = xfarray_store_anywhere(rmap_bag, rrm);
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
if (error)
return error;
- (*stack_sz)++;
- error = xrep_refc_walk_rmaps(rr, rrm, have);
+
+ error = xrep_refc_walk_rmaps(rr, rmap, have);
if (error)
return error;
}
@@ -425,8 +395,10 @@ xrep_refc_push_rmaps_at(
error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
if (error)
return error;
- if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sa.rmap_cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -436,12 +408,9 @@ STATIC int
xrep_refc_find_refcounts(
struct xrep_refc *rr)
{
- struct xrep_refc_rmap rrm;
struct xfs_scrub *sc = rr->sc;
- struct xfarray *rmap_bag;
- char *descr;
- uint64_t old_stack_sz;
- uint64_t stack_sz = 0;
+ struct rcbag *rcstack;
+ uint64_t old_stack_height;
xfs_agblock_t sbno;
xfs_agblock_t cbno;
xfs_agblock_t nbno;
@@ -451,14 +420,11 @@ xrep_refc_find_refcounts(
xrep_ag_btcur_init(sc, &sc->sa);
/*
- * Set up a sparse array to store all the rmap records that we're
- * tracking to generate a reference count record. If this exceeds
- * MAXREFCOUNT, we clamp rc_refcount.
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If the size of the bag exceeds
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
*/
- descr = xchk_xfile_ag_descr(sc, "rmap record bag");
- error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
- &rmap_bag);
- kfree(descr);
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
goto out_cur;
@@ -469,62 +435,54 @@ xrep_refc_find_refcounts(
/* Process reverse mappings into refcount data. */
while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
/* Push all rmaps with pblk == sbno onto the stack */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (!have)
break;
- sbno = cbno = rrm.startblock;
- error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
- &rrm, &have, &stack_sz);
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
if (error)
goto out_bag;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
/* While stack isn't empty... */
- while (stack_sz) {
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
-
+ while (rcbag_count(rcstack) > 0) {
/* Pop all rmaps that end at nbno */
- while ((error = xfarray_iter(rmap_bag, &array_cur,
- &rrm)) == 1) {
- if (RRM_NEXT(rrm) != nbno)
- continue;
- error = xfarray_unset(rmap_bag, array_cur - 1);
- if (error)
- goto out_bag;
- stack_sz--;
- }
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
if (error)
goto out_bag;
/* Push array items that start at nbno */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (have) {
- error = xrep_refc_push_rmaps_at(rr, rmap_bag,
- nbno, &rrm, &have, &stack_sz);
+ error = xrep_refc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
if (error)
goto out_bag;
}
/* Emit refcount if necessary */
ASSERT(nbno > cbno);
- if (stack_sz != old_stack_sz) {
- if (old_stack_sz > 1) {
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
error = xrep_refc_stash(rr,
XFS_REFC_DOMAIN_SHARED,
cbno, nbno - cbno,
- old_stack_sz);
+ old_stack_height);
if (error)
goto out_bag;
}
@@ -532,13 +490,13 @@ xrep_refc_find_refcounts(
}
/* Stack empty, go find the next rmap */
- if (stack_sz == 0)
+ if (rcbag_count(rcstack) == 0)
break;
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
sbno = nbno;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
&nbno);
if (error)
goto out_bag;
@@ -547,14 +505,13 @@ xrep_refc_find_refcounts(
}
}
- ASSERT(stack_sz == 0);
+ ASSERT(rcbag_count(rcstack) == 0);
out_bag:
- xfarray_destroy(rmap_bag);
+ rcbag_free(&rcstack);
out_cur:
xchk_ag_btcur_free(&sc->sa);
return error;
}
-#undef RRM_NEXT
/* Retrieve refcountbt data for bulk load. */
STATIC int
@@ -633,7 +590,6 @@ xrep_refc_build_new_tree(
struct xfs_scrub *sc = rr->sc;
struct xfs_btree_cur *refc_cur;
struct xfs_perag *pag = sc->sa.pag;
- xfs_fsblock_t fsbno;
int error;
error = xrep_refc_sort_records(rr);
@@ -646,15 +602,15 @@ xrep_refc_build_new_tree(
* to root the new btree while it's under construction and before we
* attach it to the AG header.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
- xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
+ xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
XFS_AG_RESV_METADATA);
rr->new_btree.bload.get_records = xrep_refc_get_records;
rr->new_btree.bload.claim_block = xrep_refc_claim_block;
/* Compute how many blocks we'll need. */
- refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
- pag);
+ refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
error = xfs_btree_bload_compute_geometry(refc_cur,
&rr->new_btree.bload,
xfarray_length(rr->refcount_records));
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 745d5b8f405a..f8f9ed30f56b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_rtbitmap.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
@@ -30,12 +31,27 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_dir2.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtalloc.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"
+#include "scrub/xfile.h"
+#include "scrub/attr_repair.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -53,6 +69,7 @@ xrep_attempt(
trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
xchk_ag_btcur_free(&sc->sa);
+ xchk_rtgroup_btcur_free(&sc->sr);
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
@@ -287,7 +304,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(pag, NULL, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, 0, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -297,7 +314,7 @@ xrep_calc_ag_resblks(
/* Now grab the block counters from the AGF. */
error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
if (error) {
- aglen = pag->block_count;
+ aglen = pag_group(pag)->xg_block_count;
freelen = aglen;
usedlen = aglen;
} else {
@@ -317,16 +334,14 @@ xrep_calc_ag_resblks(
/* If the block counts are impossible, make worst-case assumptions. */
if (aglen == NULLAGBLOCK ||
- aglen != pag->block_count ||
+ aglen != pag_group(pag)->xg_block_count ||
freelen >= aglen) {
- aglen = pag->block_count;
+ aglen = pag_group(pag)->xg_block_count;
freelen = aglen;
usedlen = aglen;
}
- xfs_perag_put(pag);
- trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
- freelen, usedlen);
+ trace_xrep_calc_ag_resblks(pag, icount, aglen, freelen, usedlen);
/*
* Figure out how many blocks we'd need worst case to rebuild
@@ -364,12 +379,48 @@ xrep_calc_ag_resblks(
rmapbt_sz = 0;
}
- trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
- inobt_sz, rmapbt_sz, refcbt_sz);
+ trace_xrep_calc_ag_resblks_btsize(pag, bnobt_sz, inobt_sz, rmapbt_sz,
+ refcbt_sz);
+ xfs_perag_put(pag);
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
+#ifdef CONFIG_XFS_RT
+/*
+ * Figure out how many blocks to reserve for a rtgroup repair. We calculate
+ * the worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-rtgroup btree.
+ */
+xfs_extlen_t
+xrep_calc_rtgroup_resblks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ uint64_t usedlen;
+ xfs_extlen_t rmapbt_sz = 0;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+ if (!xfs_has_rtgroups(mp)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno));
+ ASSERT(usedlen <= XFS_MAX_RGBLOCKS);
+
+ if (xfs_has_rmapbt(mp))
+ rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen);
+
+ trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen,
+ rmapbt_sz);
+
+ return rmapbt_sz;
+}
+#endif /* CONFIG_XFS_RT */
+
/*
* Reconstructing per-AG Btrees
*
@@ -400,18 +451,17 @@ xrep_calc_ag_resblks(
int
xrep_fix_freelist(
struct xfs_scrub *sc,
- bool can_shrink)
+ int alloc_flags)
{
struct xfs_alloc_arg args = {0};
args.mp = sc->mp;
args.tp = sc->tp;
- args.agno = sc->sa.pag->pag_agno;
+ args.agno = pag_agno(sc->sa.pag);
args.alignment = 1;
args.pag = sc->sa.pag;
- return xfs_alloc_fix_freelist(&args,
- can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+ return xfs_alloc_fix_freelist(&args, alloc_flags);
}
/*
@@ -476,7 +526,7 @@ xrep_findroot_block(
int block_level;
int error = 0;
- daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
+ daddr = xfs_agbno_to_daddr(ri->sc->sa.pag, agbno);
/*
* Blocks in the AGFL have stale contents that might just happen to
@@ -605,7 +655,7 @@ xrep_findroot_block(
else
fab->root = NULLAGBLOCK;
- trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
+ trace_xrep_findroot_block(ri->sc->sa.pag, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
xfs_trans_brelse(ri->sc->tp, bp);
@@ -687,6 +737,44 @@ xrep_find_ag_btree_roots(
}
#ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+void
+xrep_update_qflags(
+ struct xfs_scrub *sc,
+ unsigned int clear_flags,
+ unsigned int set_flags)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ if ((mp->m_qflags & clear_flags) == 0 &&
+ (mp->m_qflags & set_flags) == set_flags)
+ goto no_update;
+
+ mp->m_qflags &= ~clear_flags;
+ mp->m_qflags |= set_flags;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags &= ~clear_flags;
+ mp->m_sb.sb_qflags |= set_flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Update the quota flags in the ondisk superblock without touching
+ * the summary counters. We have not quiesced inode chunk allocation,
+ * so we cannot coordinate with updates to the icount and ifree percpu
+ * counters.
+ */
+ bp = xfs_trans_getsb(sc->tp);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+}
+
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -699,13 +787,7 @@ xrep_force_quotacheck(
if (!(flag & sc->mp->m_qflags))
return;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->mp->m_qflags &= ~flag;
- spin_lock(&sc->mp->m_sb_lock);
- sc->mp->m_sb.sb_qflags &= ~flag;
- spin_unlock(&sc->mp->m_sb_lock);
- xfs_log_sb(sc->tp);
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ xrep_update_qflags(sc, flag, 0);
}
/*
@@ -799,20 +881,20 @@ xrep_ag_btcur_init(
/* Set up a bnobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
}
/* Set up a inobt cursor for cross-referencing. */
if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sa->agi_bp, XFS_BTNUM_INO);
+ sa->agi_bp);
if (xfs_has_finobt(mp))
- sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
- sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+ sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp);
}
/* Set up a rmapbt cursor for cross-referencing. */
@@ -874,7 +956,7 @@ xrep_reinit_pagi(
ASSERT(xfs_perag_initialised_agi(pag));
clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp);
if (error)
return error;
@@ -900,7 +982,7 @@ xrep_ag_init(
ASSERT(!sa->pag);
- error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp);
if (error)
return error;
@@ -914,6 +996,83 @@ xrep_ag_init(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/* Initialize all the btree cursors for a RT repair. */
+void
+xrep_rtgroup_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ ASSERT(sr->rtg != NULL);
+
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT &&
+ (sr->rtlock_flags & XFS_RTGLOCK_RMAP) &&
+ xfs_has_rtrmapbt(mp))
+ sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg);
+
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT &&
+ (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) &&
+ xfs_has_rtreflink(mp))
+ sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg);
+}
+
+/*
+ * Given a reference to a rtgroup structure, lock rtgroup btree inodes and
+ * create btree cursors. Must only be called to repair a regular rt file.
+ */
+int
+xrep_rtgroup_init(
+ struct xfs_scrub *sc,
+ struct xfs_rtgroup *rtg,
+ struct xchk_rt *sr,
+ unsigned int rtglock_flags)
+{
+ ASSERT(sr->rtg == NULL);
+
+ xfs_rtgroup_lock(rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+
+ /* Grab our own passive reference from the caller's ref. */
+ sr->rtg = xfs_rtgroup_hold(rtg);
+ xrep_rtgroup_btcur_init(sc, sr);
+ return 0;
+}
+
+/* Ensure that all rt blocks in the given range are not marked free. */
+int
+xrep_require_rtext_inuse(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t rgbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = sc->mp;
+ xfs_rtxnum_t startrtx;
+ xfs_rtxnum_t endrtx;
+ bool is_free = false;
+ int error = 0;
+
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ startrtx = xfs_rgbno_to_rtx(mp, rgbno);
+ endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
+
+ error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx,
+ endrtx - startrtx + 1, &is_free);
+ if (error)
+ return error;
+ if (is_free)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
/* Reinitialize the per-AG block reservation for the AG we just fixed. */
int
xrep_reset_perag_resv(
@@ -929,18 +1088,15 @@ xrep_reset_perag_resv(
ASSERT(sc->tp);
sc->flags &= ~XREP_RESET_PERAG_RESV;
- error = xfs_ag_resv_free(sc->sa.pag);
- if (error)
- goto out;
+ xfs_ag_resv_free(sc->sa.pag);
error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
if (error == -ENOSPC) {
xfs_err(sc->mp,
"Insufficient free space to reset per-AG reservation for AG %u after repair.",
- sc->sa.pag->pag_agno);
+ pag_agno(sc->sa.pag));
error = 0;
}
-out:
return error;
}
@@ -970,55 +1126,27 @@ xrep_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
- __u32 smtype = sc->sm->sm_type;
- __u32 smflags = sc->sm->sm_flags;
- unsigned int sick_mask = sc->sick_mask;
+ struct xfs_scrub_subord *sub;
int error;
/*
- * Let's see if the inode needs repair. We're going to open-code calls
- * to the scrub and repair functions so that we can hang on to the
+ * Let's see if the inode needs repair. Use a subordinate scrub context
+ * to call the scrub and repair functions so that we can hang on to the
* resources that we already acquired instead of using the standard
* setup/teardown routines.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
- sc->sm->sm_type = scrub_type;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ sub = xchk_scrub_create_subord(sc, scrub_type);
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
-
- if (!xrep_will_attempt(sc))
+ if (!xrep_will_attempt(&sub->sc))
goto out;
/*
* Repair some part of the inode. This will potentially join the inode
* to the transaction.
*/
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xrep_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xrep_bmap(sc, XFS_DATA_FORK, false);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xrep_bmap(sc, XFS_ATTR_FORK, false);
- break;
- }
+ error = sub->sc.ops->repair(&sub->sc);
if (error)
goto out;
@@ -1027,10 +1155,10 @@ xrep_metadata_inode_subtype(
* that the inode will not be joined to the transaction when we exit
* the function.
*/
- error = xfs_defer_finish(&sc->tp);
+ error = xfs_defer_finish(&sub->sc.tp);
if (error)
goto out;
- error = xfs_trans_roll(&sc->tp);
+ error = xfs_trans_roll(&sub->sc.tp);
if (error)
goto out;
@@ -1038,31 +1166,18 @@ xrep_metadata_inode_subtype(
* Clear the corruption flags and re-check the metadata that we just
* repaired.
*/
- sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
-
- switch (scrub_type) {
- case XFS_SCRUB_TYPE_INODE:
- error = xchk_inode(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTD:
- error = xchk_bmap_data(sc);
- break;
- case XFS_SCRUB_TYPE_BMBTA:
- error = xchk_bmap_attr(sc);
- break;
- }
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
/* If corruption persists, the repair has failed. */
- if (xchk_needs_repair(sc->sm)) {
+ if (xchk_needs_repair(sub->sc.sm)) {
error = -EFSCORRUPTED;
goto out;
}
out:
- sc->sick_mask = sick_mask;
- sc->sm->sm_type = smtype;
- sc->sm->sm_flags = smflags;
+ xchk_scrub_free_subord(sub);
return error;
}
@@ -1088,10 +1203,17 @@ xrep_metadata_inode_forks(
if (error)
return error;
- /* Make sure the attr fork looks ok before we delete it. */
- error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
- if (error)
- return error;
+ /*
+ * Metadata files can only have extended attributes on metadir
+ * filesystems, either for parent pointers or for actual xattr data.
+ * For a non-metadir filesystem, make sure the attr fork looks ok
+ * before we delete it.
+ */
+ if (xfs_inode_hasattr(sc->ip)) {
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+ }
/* Clear the reflink flag since metadata never shares. */
if (xfs_is_reflink_inode(sc->ip)) {
@@ -1103,6 +1225,20 @@ xrep_metadata_inode_forks(
}
/*
+ * Metadata files on non-metadir filesystems cannot have attr forks,
+ * so clear them now.
+ */
+ if (xfs_inode_hasattr(sc->ip) && !xfs_has_metadir(sc->mp)) {
+ if (!dirty) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ }
+ error = xrep_xattr_reset_fork(sc);
+ if (error)
+ return error;
+ }
+
+ /*
* If we modified the inode, roll the transaction but don't rejoin the
* inode to the new transaction because xrep_bmap_data can do that.
*/
@@ -1115,3 +1251,193 @@ xrep_metadata_inode_forks(
return 0;
}
+
+/*
+ * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating
+ * a shmem file might take loks, so we cannot be in transaction context. Park
+ * our resources in the scrub context and let the teardown function take care
+ * of them at the right time.
+ */
+int
+xrep_setup_xfbtree(
+ struct xfs_scrub *sc,
+ const char *descr)
+{
+ ASSERT(sc->tp == NULL);
+
+ return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
+}
+
+/*
+ * Create a dummy transaction for use in a live update hook function. This
+ * function MUST NOT be called from regular repair code because the current
+ * process' transaction is saved via the cookie.
+ */
+int
+xrep_trans_alloc_hook_dummy(
+ struct xfs_mount *mp,
+ void **cookiep,
+ struct xfs_trans **tpp)
+{
+ int error;
+
+ *cookiep = current->journal_info;
+ current->journal_info = NULL;
+
+ error = xfs_trans_alloc_empty(mp, tpp);
+ if (!error)
+ return 0;
+
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+ return error;
+}
+
+/* Cancel a dummy transaction used by a live update hook function. */
+void
+xrep_trans_cancel_hook_dummy(
+ void **cookiep,
+ struct xfs_trans *tp)
+{
+ xfs_trans_cancel(tp);
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+}
+
+/*
+ * See if this buffer can pass the given ->verify_struct() function.
+ *
+ * If the buffer already has ops attached and they're not the ones that were
+ * passed in, we reject the buffer. Otherwise, we perform the structure test
+ * (note that we do not check CRCs) and return the outcome of the test. The
+ * buffer ops and error state are left unchanged.
+ */
+bool
+xrep_buf_verify_struct(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ const struct xfs_buf_ops *old_ops = bp->b_ops;
+ xfs_failaddr_t fa;
+ int old_error;
+
+ if (old_ops) {
+ if (old_ops != ops)
+ return false;
+ }
+
+ old_error = bp->b_error;
+ bp->b_ops = ops;
+ fa = bp->b_ops->verify_struct(bp);
+ bp->b_ops = old_ops;
+ bp->b_error = old_error;
+
+ return fa == NULL;
+}
+
+/* Check the sanity of a rmap record for a metadata btree inode. */
+int
+xrep_check_ino_btree_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /*
+ * Metadata btree inodes never have extended attributes, and all blocks
+ * should have the bmbt block flag set.
+ */
+ if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) ||
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return -EFSCORRUPTED;
+
+ /* Make sure the block is within the AG. */
+ if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Reset the block count of the inode being repaired, and adjust the dquot
+ * block usage to match. The inode must not have an xattr fork.
+ */
+void
+xrep_inode_set_nblocks(
+ struct xfs_scrub *sc,
+ int64_t new_blocks)
+{
+ int64_t delta =
+ new_blocks - sc->ip->i_nblocks;
+
+ sc->ip->i_nblocks = new_blocks;
+
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ if (delta != 0)
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT,
+ delta);
+}
+
+/* Reset the block reservation for a metadata inode. */
+int
+xrep_reset_metafile_resv(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int64_t delta;
+ int error;
+
+ delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
+ mp->m_metafile_resv_target;
+ if (delta == 0)
+ return 0;
+
+ /*
+ * Too many blocks have been reserved, transfer some from the incore
+ * reservation back to the filesystem.
+ */
+ if (delta > 0) {
+ int64_t give_back;
+
+ give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
+ if (give_back > 0) {
+ xfs_mod_sb_delalloc(mp, -give_back);
+ xfs_add_fdblocks(mp, give_back);
+ mp->m_metafile_resv_avail -= give_back;
+ }
+
+ return 0;
+ }
+
+ /*
+ * Not enough reservation; try to take some blocks from the filesystem
+ * to the metabtree reservation.
+ */
+ delta = -delta; /* delta is negative here, so invert the sign. */
+ error = xfs_dec_fdblocks(mp, delta, true);
+ while (error == -ENOSPC) {
+ delta--;
+ if (delta == 0) {
+ xfs_warn(sc->mp,
+"Insufficient free space to reset metabtree reservation after repair.");
+ return 0;
+ }
+ error = xfs_dec_fdblocks(mp, delta, true);
+ }
+ if (error)
+ return error;
+
+ xfs_mod_sb_delalloc(mp, delta);
+ mp->m_metafile_resv_avail += delta;
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 17114327e6fa..af0a3a9e5ed9 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -8,6 +8,7 @@
#include "xfs_quota_defs.h"
+struct xfs_rtgroup;
struct xchk_stats_run;
static inline int xrep_notsupported(struct xfs_scrub *sc)
@@ -49,9 +50,11 @@ xrep_trans_commit(
struct xbitmap;
struct xagb_bitmap;
+struct xrgb_bitmap;
struct xfsb_bitmap;
+struct xrtb_bitmap;
-int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -72,6 +75,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
#ifdef CONFIG_XFS_QUOTA
+void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags,
+ unsigned int set_flags);
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
#else
@@ -79,11 +84,23 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
# define xrep_ino_dqattach(sc) (0)
#endif /* CONFIG_XFS_QUOTA */
+int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr);
+
int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
xfs_extnum_t nextents);
int xrep_reset_perag_resv(struct xfs_scrub *sc);
int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
+int xrep_setup_xattr(struct xfs_scrub *sc);
+int xrep_setup_directory(struct xfs_scrub *sc);
+int xrep_setup_parent(struct xfs_scrub *sc);
+int xrep_setup_nlinks(struct xfs_scrub *sc);
+int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks);
+int xrep_setup_dirtree(struct xfs_scrub *sc);
+int xrep_setup_rtrmapbt(struct xfs_scrub *sc);
+int xrep_setup_rtrefcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -94,6 +111,20 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
struct xchk_ag *sa);
+#ifdef CONFIG_XFS_RT
+int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
+ struct xchk_rt *sr, unsigned int rtglock_flags);
+void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_filblks_t len);
+xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc);
+#else
+# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS)
+# define xrep_calc_rtgroup_resblks(sc) (0)
+#endif /* CONFIG_XFS_RT */
+
+int xrep_check_ino_btree_mapping(struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec);
/* Metadata revalidators */
@@ -109,31 +140,67 @@ int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
int xrep_allocbt(struct xfs_scrub *sc);
int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_rmapbt(struct xfs_scrub *sc);
int xrep_refcountbt(struct xfs_scrub *sc);
int xrep_inode(struct xfs_scrub *sc);
int xrep_bmap_data(struct xfs_scrub *sc);
int xrep_bmap_attr(struct xfs_scrub *sc);
int xrep_bmap_cow(struct xfs_scrub *sc);
+int xrep_nlinks(struct xfs_scrub *sc);
+int xrep_fscounters(struct xfs_scrub *sc);
+int xrep_xattr(struct xfs_scrub *sc);
+int xrep_directory(struct xfs_scrub *sc);
+int xrep_parent(struct xfs_scrub *sc);
+int xrep_symlink(struct xfs_scrub *sc);
+int xrep_dirtree(struct xfs_scrub *sc);
+int xrep_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
+int xrep_rtsummary(struct xfs_scrub *sc);
+int xrep_rgsuperblock(struct xfs_scrub *sc);
+int xrep_rtrmapbt(struct xfs_scrub *sc);
+int xrep_rtrefcountbt(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
+# define xrep_rtsummary xrep_notsupported
+# define xrep_rgsuperblock xrep_notsupported
+# define xrep_rtrmapbt xrep_notsupported
+# define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
int xrep_quota(struct xfs_scrub *sc);
+int xrep_quotacheck(struct xfs_scrub *sc);
#else
# define xrep_quota xrep_notsupported
+# define xrep_quotacheck xrep_notsupported
#endif /* CONFIG_XFS_QUOTA */
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
+int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
+ struct xfs_trans **tpp);
+void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+
+bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
+int xrep_reset_metafile_resv(struct xfs_scrub *sc);
+
#else
#define xrep_ino_dqattach(sc) (0)
-#define xrep_will_attempt(sc) (false)
+
+/*
+ * When online repair is not built into the kernel, we still want to attempt
+ * the repair so that the stub xrep_attempt below will return EOPNOTSUPP.
+ */
+static inline bool xrep_will_attempt(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ xchk_needs_repair(sc->sm);
+}
static inline int
xrep_attempt(
@@ -152,6 +219,8 @@ xrep_calc_ag_resblks(
return 0;
}
+#define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks
+
static inline int
xrep_reset_perag_resv(
struct xfs_scrub *sc)
@@ -171,9 +240,24 @@ xrep_setup_nothing(
return 0;
}
#define xrep_setup_ag_allocbt xrep_setup_nothing
+#define xrep_setup_ag_rmapbt xrep_setup_nothing
+#define xrep_setup_ag_refcountbt xrep_setup_nothing
+#define xrep_setup_xattr xrep_setup_nothing
+#define xrep_setup_directory xrep_setup_nothing
+#define xrep_setup_parent xrep_setup_nothing
+#define xrep_setup_nlinks xrep_setup_nothing
+#define xrep_setup_dirtree xrep_setup_nothing
+#define xrep_setup_metapath xrep_setup_nothing
+#define xrep_setup_rtrmapbt xrep_setup_nothing
+#define xrep_setup_rtrefcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
+static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
+{
+ return 0;
+}
+
#define xrep_revalidate_allocbt (NULL)
#define xrep_revalidate_iallocbt (NULL)
@@ -184,6 +268,7 @@ xrep_setup_nothing(
#define xrep_agi xrep_notsupported
#define xrep_allocbt xrep_notsupported
#define xrep_iallocbt xrep_notsupported
+#define xrep_rmapbt xrep_notsupported
#define xrep_refcountbt xrep_notsupported
#define xrep_inode xrep_notsupported
#define xrep_bmap_data xrep_notsupported
@@ -191,6 +276,19 @@ xrep_setup_nothing(
#define xrep_bmap_cow xrep_notsupported
#define xrep_rtbitmap xrep_notsupported
#define xrep_quota xrep_notsupported
+#define xrep_quotacheck xrep_notsupported
+#define xrep_nlinks xrep_notsupported
+#define xrep_fscounters xrep_notsupported
+#define xrep_rtsummary xrep_notsupported
+#define xrep_xattr xrep_notsupported
+#define xrep_directory xrep_notsupported
+#define xrep_parent xrep_notsupported
+#define xrep_symlink xrep_notsupported
+#define xrep_dirtree xrep_notsupported
+#define xrep_metapath xrep_notsupported
+#define xrep_rgsuperblock xrep_notsupported
+#define xrep_rtrmapbt xrep_notsupported
+#define xrep_rtrefcountbt xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h
new file mode 100644
index 000000000000..4c3126b66dcb
--- /dev/null
+++ b/fs/xfs/scrub/rgb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RGB_BITMAP_H__
+#define __XFS_SCRUB_RGB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rgblock_t */
+
+struct xrgb_bitmap {
+ struct xbitmap32 rgbitmap;
+};
+
+static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->rgbitmap);
+}
+
+static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->rgbitmap);
+}
+
+static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap,
+ xfs_rgblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->rgbitmap, start, len);
+}
+
+static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->rgbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c
new file mode 100644
index 000000000000..d189732d0e24
--- /dev/null
+++ b/fs/xfs/scrub/rgsuper.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_rtgroup.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+
+/* Set us up with a transaction and an empty context. */
+int
+xchk_setup_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ return xchk_trans_alloc(sc, 0);
+}
+
+/* Cross-reference with the other rt metadata. */
+STATIC void
+xchk_rgsuperblock_xref(
+ struct xfs_scrub *sc)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1);
+ xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS);
+}
+
+int
+xchk_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ xfs_rgnumber_t rgno = sc->sm->sm_agno;
+ int error;
+
+ /*
+ * Only rtgroup 0 has a superblock. We may someday want to use higher
+ * rgno for other functions, similar to what we do with the primary
+ * super scrub function.
+ */
+ if (rgno != 0)
+ return -ENOENT;
+
+ /*
+ * Grab an active reference to the rtgroup structure. If we can't get
+ * it, we're racing with something that's tearing down the group, so
+ * signal that the group no longer exists. Take the rtbitmap in shared
+ * mode so that the group can't change while we're doing things.
+ */
+ error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ return error;
+
+ /*
+ * Since we already validated the rt superblock at mount time, we don't
+ * need to check its contents again. All we need is to cross-reference.
+ */
+ xchk_rgsuperblock_xref(sc);
+ return 0;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int
+xrep_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ ASSERT(rtg_rgno(sc->sr.rtg) == 0);
+
+ xfs_log_sb(sc->tp);
+ return 0;
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index c99d1714f283..39e9ad7cd8ae 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -25,6 +25,7 @@
#include "scrub/btree.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt(
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_rmapbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -349,7 +358,7 @@ xchk_rmapbt_rec(
struct xfs_rmap_irec irec;
if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
- xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
+ xfs_rmap_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -401,7 +410,7 @@ xchk_rmapbt_walk_ag_metadata(
goto out;
/* OWN_LOG: Internal log */
- if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) {
+ if (xfs_ag_contains_log(mp, pag_agno(sc->sa.pag))) {
error = xagb_bitmap_set(&cr->log_owned,
XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
mp->m_sb.sb_logblocks);
@@ -412,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
cur = sc->sa.bno_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.bno_cur)
xfs_btree_del_cursor(cur, error);
@@ -422,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata(
cur = sc->sa.cnt_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.cnt_cur)
xfs_btree_del_cursor(cur, error);
@@ -447,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_INOBT: inobt, finobt */
cur = sc->sa.ino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.ino_cur)
xfs_btree_del_cursor(cur, error);
@@ -458,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata(
if (xfs_has_finobt(sc->mp)) {
cur = sc->sa.fino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sc->sa.agi_bp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp,
+ sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.fino_cur)
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..f5f73078ffe2
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,1743 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrefcount_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Reverse Mapping Btree Repair
+ * ============================
+ *
+ * This is the most involved of all the AG space btree rebuilds. Everywhere
+ * else in XFS we lock inodes and then AG data structures, but generating the
+ * list of rmap records requires that we be able to scan both block mapping
+ * btrees of every inode in the filesystem to see if it owns any extents in
+ * this AG. We can't tolerate any inode updates while we do this, so we
+ * freeze the filesystem to lock everyone else out, and grant ourselves
+ * special privileges to run transactions with regular background reclamation
+ * turned off.
+ *
+ * We also have to be very careful not to allow inode reclaim to start a
+ * transaction because all transactions (other than our own) will block.
+ * Deferred inode inactivation helps us out there.
+ *
+ * I) Reverse mappings for all non-space metadata and file data are collected
+ * according to the following algorithm:
+ *
+ * 1. For each fork of each inode:
+ * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
+ * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
+ * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree
+ * block.
+ * 1.3. If the incore extent map is loaded but the fork is in btree format,
+ * just visit the bmbt blocks to set the corresponding BMBIT areas.
+ * 1.4. From the incore extent map, accumulate each bmap that falls into our
+ * target AG. Remember, multiple bmap records can map to a single rmap
+ * record, so we cannot simply emit rmap records 1:1.
+ * 1.5. Emit rmap records for each extent in BMBIT and free it.
+ * 2. Create bitmaps INOBIT and ICHUNKBIT.
+ * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
+ * and set bits in INOBIT for each btree block. If the inobt has no records
+ * at all, we must be careful to record its root in INOBIT.
+ * 4. For each block in the finobt, set the corresponding INOBIT area.
+ * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
+ * 6. Create bitmaps REFCBIT and COWBIT.
+ * 7. For each CoW staging extent in the refcountbt, set the corresponding
+ * areas in COWBIT.
+ * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
+ * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
+ * A. Emit rmap for the AG headers.
+ * B. Emit rmap for the log, if there is one.
+ *
+ * II) The rmapbt shape and space metadata rmaps are computed as follows:
+ *
+ * 1. Count the rmaps collected in the previous step. (= NR)
+ * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
+ * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
+ * 4. Create bitmap AGBIT.
+ * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
+ * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
+ * 7. Count the extents in AGBIT. (= AGNR)
+ * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
+ * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
+ * and clear AGBIT. Go to step 5.
+ * A. Emit rmaps for each extent in AGBIT.
+ *
+ * III) The rmapbt is constructed and set in place as follows:
+ *
+ * 1. Sort the rmap records.
+ * 2. Bulk load the rmaps.
+ *
+ * IV) Reap the old btree blocks.
+ *
+ * 1. Create a bitmap OLDRMBIT.
+ * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
+ * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
+ * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
+ * the parts of the AG that the rmap didn't find during its scan of the
+ * primary metadata and aren't known to be in the free space, which implies
+ * that they were the old rmapbt blocks.
+ * 5. Commit.
+ *
+ * We use the 'xrep_rmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rmap {
+ /* new rmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* in-memory btree cursor for the xfs_btree_bload iteration */
+ struct xfs_btree_cur *mcur;
+
+ /* Hooks into rmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* Number of non-freespace records found. */
+ unsigned long long nr_records;
+
+ /* bnobt/cntbt contribution to btreeblks */
+ xfs_agblock_t freesp_btblocks;
+
+ /* old agf_rmap_blocks counter */
+ unsigned int old_rmapbt_fsbcount;
+};
+
+/* Set us up to repair reverse mapping btrees. */
+int
+xrep_setup_ag_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rmap_stash(
+ struct xrep_rmap *rr,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rmap_found(sc->sa.pag, &rmap);
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+struct xrep_rmap_stash_run {
+ struct xrep_rmap *rr;
+ uint64_t owner;
+ unsigned int rmap_flags;
+};
+
+static int
+xrep_rmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rmap_stash_run *rsr = priv;
+ struct xrep_rmap *rr = rsr->rr;
+
+ return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+ struct xrep_rmap *rr,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ .rmap_flags = 0,
+ };
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+ return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ /* Bitmap of bmbt blocks in this AG. */
+ struct xagb_bitmap bmbt_blocks;
+
+ struct xrep_rmap *rr;
+
+ /* Which inode fork? */
+ int whichfork;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rmap_stash_accumulated(
+ struct xrep_rmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ xfs_agblock_t agbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+ pag_agno(rf->rr->sc->sa.pag))
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+ if (rf->whichfork == XFS_ATTR_FORK)
+ rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ agbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = agbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/* Add a btree block to the bitmap. */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_owner_info oinfo;
+ struct xrep_rmap *rr = rf->rr;
+ int error;
+
+ xagb_bitmap_init(&rf->bmbt_blocks);
+
+ /* Record all the blocks in the btree itself. */
+ error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+ XFS_BTREE_VISIT_ALL, rf);
+ if (error)
+ goto out;
+
+ /* Emit rmaps for the btree blocks. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+ error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+ if (error)
+ goto out;
+
+ /* Stash any remaining accumulated rmaps. */
+ error = xrep_rmap_stash_accumulated(rf);
+out:
+ xagb_bitmap_destroy(&rf->bmbt_blocks);
+ return error;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG. Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp;
+ int error;
+
+ *mappings_done = false;
+ ifp = xfs_ifork_ptr(ip, rf->whichfork);
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+ if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+ xfs_need_iread_extents(ifp)) {
+ /*
+ * If the incore extent cache isn't loaded, scan the bmbt for
+ * mapping records. This avoids loading the incore extent
+ * tree, which will increase memory pressure at a time when
+ * we're trying to run as quickly as we possibly can. Ignore
+ * realtime extents.
+ */
+ error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+ if (error)
+ goto out_cur;
+
+ *mappings_done = true;
+ }
+
+ /* Scan for the bmbt blocks, which always live on the data device. */
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rmap_stash_accumulated(rf);
+}
+
+static int
+xrep_rmap_scan_meta_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = rf->rr->sc;
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ enum xfs_rtg_inodes type;
+ int error;
+
+ if (rf->whichfork != XFS_DATA_FORK)
+ return -EFSCORRUPTED;
+
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ type = XFS_RTGI_RMAP;
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ type = XFS_RTGI_REFCOUNT;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+ if (ip == rtg->rtg_inodes[type])
+ goto found;
+ }
+
+ /*
+ * We should never find an rt metadata btree inode that isn't
+ * associated with an rtgroup yet has ondisk blocks allocated to it.
+ */
+ if (ip->i_nblocks) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+
+found:
+ switch (ip->i_metatype) {
+ case XFS_METAFILE_RTRMAP:
+ cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
+ break;
+ case XFS_METAFILE_RTREFCOUNT:
+ cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_rtg;
+ }
+
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+ xfs_btree_del_cursor(cur, error);
+out_rtg:
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
+/* Find all the extents from a given AG in an inode fork. */
+STATIC int
+xrep_rmap_scan_ifork(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xrep_rmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ .whichfork = whichfork,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ bool mappings_done;
+ int error = 0;
+
+ if (!ifp)
+ return 0;
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_BTREE:
+ /*
+ * Scan the bmap btree for data device mappings. This includes
+ * the btree blocks themselves, even if this is a realtime
+ * file.
+ */
+ error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ /* Scan incore extent cache if this isn't a realtime file. */
+ if (xfs_ifork_is_realtime(ip, whichfork))
+ return 0;
+
+ return xrep_rmap_scan_iext(&rf, ifp);
+ case XFS_DINODE_FMT_META_BTREE:
+ return xrep_rmap_scan_meta_btree(&rf, ip);
+ }
+
+ return 0;
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
+ * attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rmap_scan_inode(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
+ int error;
+
+ /* Check the data fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Check the attr fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* COW fork extents are "owned" by the refcount btree. */
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+struct xrep_rmap_inodes {
+ struct xrep_rmap *rr;
+ struct xagb_bitmap inobt_blocks; /* INOBIT */
+ struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */
+};
+
+/* Record inode btree rmaps. */
+STATIC int
+xrep_rmap_walk_inobt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_rmap_inodes *ri = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agino_t agino;
+ xfs_agino_t iperhole;
+ unsigned int i;
+ int error;
+
+ /* Record the inobt blocks. */
+ error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+ if (error)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+ if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
+ return -EFSCORRUPTED;
+
+ agino = irec.ir_startino;
+
+ /* Record a non-sparse inode chunk. */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ aglen = max_t(xfs_extlen_t, 1,
+ XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+ return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ }
+
+ /* Iterate each chunk. */
+ iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+ XFS_INODES_PER_HOLEMASK_BIT);
+ aglen = iperhole / mp->m_sb.sb_inopblock;
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INOBT_HOLEMASK_BITS;
+ i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+ /* Skip holes. */
+ if (irec.ir_holemask & (1 << i))
+ continue;
+
+ /* Record the inode chunk otherwise. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_inodes ri = {
+ .rr = rr,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ xagb_bitmap_init(&ri.inobt_blocks);
+ xagb_bitmap_init(&ri.ichunk_blocks);
+
+ /*
+ * Iterate every record in the inobt so we can capture all the inode
+ * chunks and the blocks in the inobt itself.
+ */
+ error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Note that if there are zero records in the inobt then query_all does
+ * nothing and we have to account the empty inobt root manually.
+ */
+ if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+
+ error = xagb_bitmap_set(&ri.inobt_blocks,
+ be32_to_cpu(agi->agi_root), 1);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Scan the finobt too. */
+ if (xfs_has_finobt(sc->mp)) {
+ error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+ sc->sa.fino_cur);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+ &XFS_RMAP_OINFO_INOBT);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+ &XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri.inobt_blocks);
+ xagb_bitmap_destroy(&ri.ichunk_blocks);
+ return error;
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rmap_walk_cowblocks(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+
+ if (!xfs_refcount_check_domain(irec) ||
+ irec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
+ struct xagb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_reflink(sc->mp))
+ return 0;
+
+ xagb_bitmap_init(&refcountbt_blocks);
+ xagb_bitmap_init(&cow_blocks);
+
+ /* refcountbt */
+ error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+ if (error)
+ goto out_bitmap;
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+ xrep_rmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+ xagb_bitmap_destroy(&cow_blocks);
+ xagb_bitmap_destroy(&refcountbt_blocks);
+ return error;
+}
+
+/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ /* Create a record for the AG sb->agfl. */
+ return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
+ XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
+ XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate rmaps for the log, if it's in this AG. */
+STATIC int
+xrep_rmap_find_log_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
+ return 0;
+
+ return xrep_rmap_stash(rr,
+ XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
+ sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+}
+
+/* Check and count all the records that we gathered. */
+STATIC int
+xrep_rmap_check_record(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ int error;
+
+ error = xrep_rmap_check_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ rr->nr_records++;
+ return 0;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG, a list of the old rmapbt
+ * blocks, and the new btreeblks count. Figure out if we have enough free
+ * space to reconstruct the inode btrees. The caller must clean up the lists
+ * if anything goes wrong. This implements section (I) above.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xchk_ag *sa = &sc->sa;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Find all the per-AG metadata. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ error = xrep_rmap_find_inode_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_refcount_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_agheader_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the AG header buffers and cancel the transaction to release
+ * the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ sa->agf_bp = NULL;
+ sa->agi_bp = NULL;
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /* Iterate all AGs for inodes rmaps. */
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the AG headers in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+ error = xchk_perag_drain_and_lock(sc);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned space in the filesystem. At the same time, check
+ * all our records before we start building a new btree, which requires
+ * a bnobt cursor.
+ */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+struct xrep_rmap_agfl {
+ struct xagb_bitmap *bitmap;
+ xfs_agnumber_t agno;
+};
+
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xrep_rmap_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_rmap_agfl *ra = priv;
+
+ return xagb_bitmap_set(ra->bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata. When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur,
+ struct xagb_bitmap *freesp_blocks,
+ uint64_t *blocks_reserved,
+ bool *done)
+{
+ struct xrep_rmap_agfl ra = {
+ .bitmap = freesp_blocks,
+ .agno = pag_agno(rr->sc->sa.pag),
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ uint64_t nr_blocks; /* RMB */
+ uint64_t freesp_records;
+ int error;
+
+ /*
+ * We're going to recompute new_btree.bload.nr_blocks at the end of
+ * this function to reflect however many btree blocks we need to store
+ * all the rmap records (including the ones that reflect the changes we
+ * made to support the new rmapbt blocks), so we save the old value
+ * here so we can decide if we've reserved enough blocks.
+ */
+ nr_blocks = rr->new_btree.bload.nr_blocks;
+
+ /*
+ * Make sure we've reserved enough space for the new btree. This can
+ * change the shape of the free space btrees, which can cause secondary
+ * interactions with the rmap records because all three space btrees
+ * have the same rmap owner. We'll account for all that below.
+ */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ nr_blocks - *blocks_reserved);
+ if (error)
+ return error;
+
+ *blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+ /* Clear everything in the bitmap. */
+ xagb_bitmap_destroy(freesp_blocks);
+
+ /* Set all the bnobt blocks in the bitmap. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ return error;
+
+ /* Set all the cntbt blocks in the bitmap. */
+ sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+ xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+ sc->sa.cnt_cur = NULL;
+ if (error)
+ return error;
+
+ /* Record our new btreeblks value. */
+ rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+ /* Set all the new rmapbt blocks in the bitmap. */
+ list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+ error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+ if (error)
+ return error;
+ }
+
+ /* Set all the AGFL blocks in the bitmap. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+ if (error)
+ return error;
+
+ /* Count the extents in the bitmap. */
+ freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+ /* Compute how many blocks we'll need for all the rmaps. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records + freesp_records);
+ if (error)
+ return error;
+
+ /* We're done when we don't need more blocks. */
+ *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+ return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata. This implements section (II) above.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur)
+{
+ struct xagb_bitmap freesp_blocks; /* AGBIT */
+ uint64_t blocks_reserved = 0;
+ bool done = false;
+ int error;
+
+ /* Compute how many blocks we'll need for the rmaps collected so far. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ xagb_bitmap_init(&freesp_blocks);
+
+ /*
+ * Iteratively reserve space for the new rmapbt and recompute the
+ * number of blocks needed to store the previously observed rmapbt
+ * records and the ones we'll create for the free space metadata.
+ * Finish when we don't need more blocks.
+ */
+ do {
+ error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+ &blocks_reserved, &done);
+ if (error)
+ goto out_bitmap;
+ } while (!done);
+
+ /* Emit rmaps for everything in the free space bitmap. */
+ xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+ error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+ xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+ xagb_bitmap_destroy(&freesp_blocks);
+ return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/* Update the AGF counters. */
+STATIC int
+xrep_rmap_reset_counters(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t rmap_btblocks;
+
+ /*
+ * The AGF header contains extra information related to the reverse
+ * mapping btree, so we must update those fields here.
+ */
+ rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+ agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the rmapbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/* Retrieve rmapbt data for bulk load. */
+STATIC int
+xrep_rmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ int stat = 0;
+
+ error = xfs_btree_increment(rr->mcur, 0, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Custom allocation function for new rmap btrees. */
+STATIC int
+xrep_rmap_alloc_vextent(
+ struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint)
+{
+ int error;
+
+ /*
+ * We don't want an rmap update on the allocation, since we iteratively
+ * compute the OWN_AG records /after/ allocating blocks for the records
+ * that we already know we need to store. Therefore, fix the freelist
+ * with the NORMAP flag set so that we don't also try to create an rmap
+ * for new AGFL blocks.
+ */
+ error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+ if (error)
+ return error;
+
+ /*
+ * If xrep_fix_freelist fixed the freelist by moving blocks from the
+ * free space btrees or by removing blocks from the AGFL and queueing
+ * an EFI to free the block, the transaction will be dirty. This
+ * second case is of interest to us.
+ *
+ * Later on, we will need to compare gaps in the new recordset against
+ * the block usage of all OWN_AG owners in order to free the old
+ * btree's blocks, which means that we can't have EFIs for former AGFL
+ * blocks attached to the repair transaction when we commit the new
+ * btree.
+ *
+ * xrep_newbt_alloc_blocks guarantees this for us by calling
+ * xrep_defer_finish to commit anything that fix_freelist may have
+ * added to the transaction.
+ */
+ return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+
+/* Count the records in this btree. */
+STATIC int
+xrep_rmap_count_records(
+ struct xfs_btree_cur *cur,
+ unsigned long long *nr)
+{
+ int running = 1;
+ int error;
+
+ *nr = 0;
+
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ return error;
+
+ while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
+ if (running)
+ (*nr)++;
+ }
+
+ return error;
+}
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_btree_cur *rmap_cur;
+ int error;
+
+ /*
+ * Preserve the old rmapbt block count so that we can adjust the
+ * per-AG rmapbt reservation after we commit the new btree root and
+ * want to dispose of the old btree blocks.
+ */
+ rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header. The new blocks are accounted to the
+ * rmapbt per-AG reservation, which we will adjust further after
+ * committing the new btree.
+ */
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
+ XFS_AG_RESV_RMAPBT);
+ rr->new_btree.bload.get_records = xrep_rmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+ rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+ rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+ /*
+ * Initialize @rr->new_btree, reserve space for the new rmapbt,
+ * and compute OWN_AG rmaps.
+ */
+ error = xrep_rmap_reserve_space(rr, rmap_cur);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Count the rmapbt records again, because the space reservation
+ * for the rmapbt itself probably added more records to the btree.
+ */
+ rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+ &rr->rmap_btree);
+
+ error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+ /*
+ * Move the cursor to the left edge of the tree so that the first
+ * increment in ->get_records positions us at the first record.
+ */
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_level;
+
+ /* Add all observed rmap records. */
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /*
+ * The newly committed rmap recordset includes mappings for the blocks
+ * that we reserved to build the new btree. If there is excess space
+ * reservation to be freed, the corresponding rmap records must also be
+ * removed.
+ */
+ rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_rmap_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_rmap_level = 0;
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+struct xrep_rmap_find_gaps {
+ struct xagb_bitmap rmap_gaps;
+ xfs_agblock_t next_agbno;
+};
+
+/* Subtract each free extent in the bnobt from the rmap gaps. */
+STATIC int
+xrep_rmap_find_freesp(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+
+ return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
+ rec->ar_blockcount);
+}
+
+/* Record the free space we find, as part of cleaning out the btree. */
+STATIC int
+xrep_rmap_find_gaps(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+ int error;
+
+ if (rec->rm_startblock > rfg->next_agbno) {
+ error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
+ rec->rm_startblock - rfg->next_agbno);
+ if (error)
+ return error;
+ }
+
+ rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
+ * a list of gaps in the rmap records and a list of the extents mentioned in
+ * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
+ * in the bnobt is a block from the old rmapbt and can be removed.
+ */
+STATIC int
+xrep_rmap_remove_old_tree(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_find_gaps rfg = {
+ .next_agbno = 0,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_btree_cur *mcur;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&rfg.rmap_gaps);
+
+ /* Compute free space from the new rmapbt. */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+
+ error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_bitmap;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (rfg.next_agbno < agend) {
+ error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
+ agend - rfg.next_agbno);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Compute free space from the existing bnobt. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
+ &rfg);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Free the "free" blocks that the new rmapbt knows about but the bnobt
+ * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
+ * block usage count back to the per-AG rmapbt reservation (and not
+ * fdblocks, since the rmap btree lives in free space) to keep the
+ * reservation and free space accounting correct.
+ */
+ error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
+ &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Now that we've zapped all the old rmapbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservation.
+ */
+ pag->pagf_repair_rmap_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+out_bitmap:
+ xagb_bitmap_destroy(&rfg.rmap_gaps);
+ return error;
+}
+
+static inline bool
+xrep_rmapbt_want_live_update(
+ struct xchk_iscan *iscan,
+ const struct xfs_owner_info *oi)
+{
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ /*
+ * Before unlocking the AG header to perform the inode scan, we
+ * recorded reverse mappings for all AG metadata except for the OWN_AG
+ * metadata. IOWs, the in-memory btree knows about the AG headers, the
+ * two inode btrees, the CoW staging extents, and the refcount btrees.
+ * For these types of metadata, we need to record the live updates in
+ * the in-memory rmap btree.
+ *
+ * However, we do not scan the free space btrees or the AGFL until we
+ * have re-locked the AGF and are ready to reserve space for the new
+ * rmap btree, so we do not want live updates for OWN_AG metadata.
+ */
+ if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+ return oi->oi_owner != XFS_RMAP_OWN_AG;
+
+ /* Ignore updates to files that the scanner hasn't visited yet. */
+ return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ */
+static int
+xrep_rmapbt_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_rmap_update_params *p = data;
+ struct xrep_rmap *rr;
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *mcur;
+ struct xfs_trans *tp;
+ void *txcookie;
+ int error;
+
+ rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+ mp = rr->sc->mp;
+
+ if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+ goto out_unlock;
+
+ trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
+
+ error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+ if (error)
+ goto out_abort;
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+ error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+ p->blockcount, &p->oinfo, p->unwritten);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+ if (error)
+ goto out_cancel;
+
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+ return NOTIFY_DONE;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, tp);
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+ mutex_unlock(&rr->lock);
+ xchk_iscan_abort(&rr->iscan);
+out_unlock:
+ return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rmap_setup_scan(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+
+ /* Set up in-memory rmap btree */
+ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
+ pag_agno(sc->sa.pag));
+ if (error)
+ goto out_mutex;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * AGF buffer to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
+ error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+out_mutex:
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rmap_teardown(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+ mutex_destroy(&rr->lock);
+}
+
+/* Repair the rmap btree for some AG. */
+int
+xrep_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr = sc->buf;
+ int error;
+
+ error = xrep_rmap_setup_scan(rr);
+ if (error)
+ return error;
+
+ /*
+ * Collect rmaps for everything in this AG that isn't space metadata.
+ * These rmaps won't change even as we try to allocate blocks.
+ */
+ error = xrep_rmap_find_rmaps(rr);
+ if (error)
+ goto out_records;
+
+ /* Rebuild the rmap information. */
+ error = xrep_rmap_build_new_tree(rr);
+ if (error)
+ goto out_records;
+
+ /* Kill the old tree. */
+ error = xrep_rmap_remove_old_tree(rr);
+ if (error)
+ goto out_records;
+
+out_records:
+ xrep_rmap_teardown(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h
new file mode 100644
index 000000000000..1313ef605511
--- /dev/null
+++ b/fs/xfs/scrub/rtb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTB_BITMAP_H__
+#define __XFS_SCRUB_RTB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_rtblock_t */
+
+struct xrtb_bitmap {
+ struct xbitmap64 rtbitmap;
+};
+
+static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->rtbitmap);
+}
+
+static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->rtbitmap);
+}
+
+static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap,
+ xfs_rtblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->rtbitmap, start, len);
+}
+
+static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->rtbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 441ca9977652..d5ff8609dbfb 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,16 +9,25 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bit.h"
+#include "xfs_rtgroup.h"
+#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
+#include "scrub/tempexch.h"
#include "scrub/rtbitmap.h"
+#include "scrub/btree.h"
/* Set us up with the realtime metadata locked. */
int
@@ -29,10 +38,19 @@ xchk_setup_rtbitmap(
struct xchk_rtbitmap *rtb;
int error;
- rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)),
+ XCHK_GFP_FLAGS);
if (!rtb)
return -ENOMEM;
sc->buf = rtb;
+ rtb->sc = sc;
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
if (xchk_could_repair(sc)) {
error = xrep_setup_rtbitmap(sc, rtb);
@@ -44,7 +62,7 @@ xchk_setup_rtbitmap(
if (error)
return error;
- error = xchk_install_live_inode(sc, sc->mp->m_rbmip);
+ error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg));
if (error)
return error;
@@ -52,7 +70,9 @@ xchk_setup_rtbitmap(
if (error)
return error;
- xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+ if (error)
+ return error;
/*
* Now that we've locked the rtbitmap, we can't race with growfsrt
@@ -60,32 +80,65 @@ xchk_setup_rtbitmap(
* Hence it is safe to compute and check the geometry values.
*/
if (mp->m_sb.sb_rblocks) {
- rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
- rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+ rtb->rbmblocks = xfs_rtbitmap_blockcount(mp);
}
+
return 0;
}
-/* Realtime bitmap. */
+/* Per-rtgroup bitmap contents. */
+
+/* Cross-reference rtbitmap entries with other metadata. */
+STATIC void
+xchk_rtbitmap_xref(
+ struct xchk_rtbitmap *rtb,
+ xfs_rtblock_t startblock,
+ xfs_rtblock_t blockcount)
+{
+ struct xfs_scrub *sc = rtb->sc;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+ if (!sc->sr.rmap_cur)
+ return;
+
+ xchk_xref_has_no_rt_owner(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_shared(sc, rgbno, blockcount);
+ xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount);
+
+ if (rtb->next_free_rgbno < rgbno)
+ xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
+ rgbno - rtb->next_free_rgbno);
+ rtb->next_free_rgbno = rgbno + blockcount;
+}
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_scrub *sc = priv;
+ struct xchk_rtbitmap *rtb = priv;
+ struct xfs_scrub *sc = rtb->sc;
xfs_rtblock_t startblock;
xfs_filblks_t blockcount;
- startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
- blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ startblock = xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ blockcount = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
- if (!xfs_verify_rtbext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+ xchk_rtbitmap_xref(rtb, startblock, blockcount);
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
return 0;
}
@@ -133,24 +186,27 @@ xchk_rtbitmap_check_extents(
return error;
}
-/* Scrub the realtime bitmap. */
+/* Scrub this group's realtime bitmap. */
int
xchk_rtbitmap(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
struct xchk_rtbitmap *rtb = sc->buf;
+ xfs_rgblock_t last_rgbno;
int error;
/* Is sb_rextents correct? */
if (mp->m_sb.sb_rextents != rtb->rextents) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
/* Is sb_rextslog correct? */
if (mp->m_sb.sb_rextslog != rtb->rextslog) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -159,17 +215,17 @@ xchk_rtbitmap(
* case can we exceed 4bn bitmap blocks since the super field is a u32.
*/
if (rtb->rbmblocks > U32_MAX) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
/* The bitmap file length must be aligned to an fsblock. */
- if (mp->m_rbmip->i_disk_size & mp->m_blockmask) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ if (rbmip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -178,8 +234,8 @@ xchk_rtbitmap(
* growfsrt expands the bitmap file before updating sb_rextents, so the
* file can be larger than sb_rbmblocks.
*/
- if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ if (rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -192,10 +248,20 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
+ rtb->next_free_rgbno = 0;
+ error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
+ /*
+ * Check that the are rmappings for all rt extents between the end of
+ * the last free extent we saw and the last possible extent in the rt
+ * group.
+ */
+ last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1;
+ if (rtb->next_free_rgbno < last_rgbno)
+ xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno,
+ last_rgbno - rtb->next_free_rgbno);
return 0;
}
@@ -206,6 +272,7 @@ xchk_xref_is_used_rt_space(
xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -214,15 +281,19 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
+ if (xfs_has_zoned(sc->mp)) {
+ if (!xfs_zone_rgbno_is_valid(rtg,
+ xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+ xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+ return;
+ }
+
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
- xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
- goto out_unlock;
+ return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
-out_unlock:
- xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
index 85304ff019e1..fe52b877253d 100644
--- a/fs/xfs/scrub/rtbitmap.h
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -6,17 +6,72 @@
#ifndef __XFS_SCRUB_RTBITMAP_H__
#define __XFS_SCRUB_RTBITMAP_H__
+/*
+ * We use an xfile to construct new bitmap blocks for the portion of the
+ * rtbitmap file that we're replacing. Whereas the ondisk bitmap must be
+ * accessed through the buffer cache, the xfile bitmap supports direct
+ * word-level accesses. Therefore, we create a small abstraction for linear
+ * access.
+ */
+typedef unsigned long long xrep_wordoff_t;
+typedef unsigned int xrep_wordcnt_t;
+
+/* Mask to round an rtx down to the nearest bitmap word. */
+#define XREP_RTBMP_WORDMASK ((1ULL << XFS_NBWORDLOG) - 1)
+
+
struct xchk_rtbitmap {
+ struct xfs_scrub *sc;
+
uint64_t rextents;
uint64_t rbmblocks;
unsigned int rextslog;
unsigned int resblks;
+
+ /* The next free rt group block number that we expect to see. */
+ xfs_rgblock_t next_free_rgbno;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /* stuff for staging a new bitmap */
+ struct xfs_rtalloc_args args;
+ struct xrep_tempexch tempexch;
+#endif
+
+ /* The next rtgroup block we expect to see during our rtrmapbt walk. */
+ xfs_rgblock_t next_rgbno;
+
+ /* rtgroup lock flags */
+ unsigned int rtglock_flags;
+
+ /* rtword position of xfile as we write buffers to disk. */
+ xrep_wordoff_t prep_wordoff;
+
+ /* In-Memory rtbitmap for repair. */
+ union xfs_rtword_raw words[];
};
#ifdef CONFIG_XFS_ONLINE_REPAIR
int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+
+/*
+ * How big should the words[] buffer be?
+ *
+ * For repairs, we want a full fsblock worth of space so that we can memcpy a
+ * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have
+ * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the
+ * words buffer at all.
+ */
+static inline unsigned int
+xchk_rtbitmap_wordcnt(
+ struct xfs_scrub *sc)
+{
+ if (xchk_could_repair(sc))
+ return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+ return 0;
+}
#else
# define xrep_setup_rtbitmap(sc, rtb) (0)
+# define xchk_rtbitmap_wordcnt(sc) (0)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
#endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 46f5d5f605c9..203a1a97c502 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -12,32 +12,66 @@
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_extent_busy.h"
+#include "xfs_refcount.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
#include "scrub/rtbitmap.h"
-/* Set up to repair the realtime bitmap file metadata. */
+/* rt bitmap content repairs */
+
+/* Set up to repair the realtime bitmap for this group. */
int
xrep_setup_rtbitmap(
struct xfs_scrub *sc,
struct xchk_rtbitmap *rtb)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long blocks = 0;
+ char *descr;
+ unsigned long long blocks = mp->m_sb.sb_rbmblocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ /* Create an xfile to hold our reconstructed bitmap. */
+ descr = xchk_xfile_rtgroup_descr(sc, "bitmap file");
+ error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile);
+ kfree(descr);
+ if (error)
+ return error;
/*
- * Reserve enough blocks to write out a completely new bmbt for a
- * maximally fragmented bitmap file. We do not hold the rtbitmap
- * ILOCK yet, so this is entirely speculative.
+ * Reserve enough blocks to write out a completely new bitmap file,
+ * plus twice as many blocks as we would need if we can only allocate
+ * one block per data fork mapping. This should cover the
+ * preallocation of the temporary file and exchanging the extent
+ * mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement bitmap and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * rtbitmap ILOCK) and cannot ask for more reservation.
*/
- blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
if (blocks > UINT_MAX)
return -EOPNOTSUPP;
@@ -45,6 +79,325 @@ xrep_setup_rtbitmap(
return 0;
}
+static inline xrep_wordoff_t
+rtx_to_wordoff(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> XFS_NBWORDLOG;
+}
+
+static inline xrep_wordcnt_t
+rtxlen_to_wordcnt(
+ xfs_rtxlen_t rtxlen)
+{
+ return rtxlen >> XFS_NBWORDLOG;
+}
+
+/* Helper functions to record rtwords in an xfile. */
+
+static inline int
+xfbmp_load(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ xfs_rtword_t *word)
+{
+ union xfs_rtword_raw urk;
+ int error;
+
+ ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+ error = xfile_load(rtb->sc->xfile, &urk,
+ sizeof(union xfs_rtword_raw),
+ wordoff << XFS_WORDLOG);
+ if (error)
+ return error;
+
+ *word = be32_to_cpu(urk.rtg);
+ return 0;
+}
+
+static inline int
+xfbmp_store(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ const xfs_rtword_t word)
+{
+ union xfs_rtword_raw urk;
+
+ ASSERT(xfs_has_rtgroups(rtb->sc->mp));
+
+ urk.rtg = cpu_to_be32(word);
+ return xfile_store(rtb->sc->xfile, &urk,
+ sizeof(union xfs_rtword_raw),
+ wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyin(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ const union xfs_rtword_raw *word,
+ xrep_wordcnt_t nr_words)
+{
+ return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+ wordoff << XFS_WORDLOG);
+}
+
+static inline int
+xfbmp_copyout(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ union xfs_rtword_raw *word,
+ xrep_wordcnt_t nr_words)
+{
+ return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG,
+ wordoff << XFS_WORDLOG);
+}
+
+/* Perform a logical OR operation on an rtword in the incore bitmap. */
+static int
+xrep_rtbitmap_or(
+ struct xchk_rtbitmap *rtb,
+ xrep_wordoff_t wordoff,
+ xfs_rtword_t mask)
+{
+ xfs_rtword_t word;
+ int error;
+
+ error = xfbmp_load(rtb, wordoff, &word);
+ if (error)
+ return error;
+
+ trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word);
+
+ return xfbmp_store(rtb, wordoff, word | mask);
+}
+
+/*
+ * Mark as free every rt extent between the next rt block we expected to see
+ * in the rtrmap records and the given rt block.
+ */
+STATIC int
+xrep_rtbitmap_mark_free(
+ struct xchk_rtbitmap *rtb,
+ xfs_rgblock_t rgbno)
+{
+ struct xfs_mount *mp = rtb->sc->mp;
+ struct xchk_rt *sr = &rtb->sc->sr;
+ struct xfs_rtgroup *rtg = sr->rtg;
+ xfs_rtxnum_t startrtx;
+ xfs_rtxnum_t nextrtx;
+ xrep_wordoff_t wordoff, nextwordoff;
+ unsigned int bit;
+ unsigned int bufwsize;
+ xfs_extlen_t mod;
+ xfs_rtword_t mask;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno))
+ return -EFSCORRUPTED;
+
+ /*
+ * Convert rt blocks to rt extents The block range we find must be
+ * aligned to an rtextent boundary on both ends.
+ */
+ startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno);
+ mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno);
+ if (mod)
+ return -EFSCORRUPTED;
+
+ nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1;
+ mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1);
+ if (mod != mp->m_sb.sb_rextsize - 1)
+ return -EFSCORRUPTED;
+
+ /* Must not be shared or CoW staging. */
+ if (sr->refc_cur) {
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sr->refc_cur,
+ XFS_REFC_DOMAIN_COW, rtb->next_rgbno,
+ rgbno - rtb->next_rgbno, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1);
+
+ /* Set bits as needed to round startrtx up to the nearest word. */
+ bit = startrtx & XREP_RTBMP_WORDMASK;
+ if (bit) {
+ xfs_rtblock_t len = nextrtx - startrtx;
+ unsigned int lastbit;
+
+ lastbit = min(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+
+ error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx),
+ mask);
+ if (error || lastbit - bit == len)
+ return error;
+ startrtx += XFS_NBWORD - bit;
+ }
+
+ /* Set bits as needed to round nextrtx down to the nearest word. */
+ bit = nextrtx & XREP_RTBMP_WORDMASK;
+ if (bit) {
+ mask = ((xfs_rtword_t)1 << bit) - 1;
+
+ error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx),
+ mask);
+ if (error || startrtx + bit == nextrtx)
+ return error;
+ nextrtx -= bit;
+ }
+
+ trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1);
+
+ /* Set all the words in between, up to a whole fs block at once. */
+ wordoff = rtx_to_wordoff(mp, startrtx);
+ nextwordoff = rtx_to_wordoff(mp, nextrtx);
+ bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG;
+
+ while (wordoff < nextwordoff) {
+ xrep_wordoff_t rem;
+ xrep_wordcnt_t wordcnt;
+
+ wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff,
+ bufwsize);
+
+ /*
+ * Try to keep us aligned to the rtwords buffer to reduce the
+ * number of xfile writes.
+ */
+ rem = wordoff & (bufwsize - 1);
+ if (rem)
+ wordcnt = min_t(xrep_wordcnt_t, wordcnt,
+ bufwsize - rem);
+
+ error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt);
+ if (error)
+ return error;
+
+ wordoff += wordcnt;
+ }
+
+ return 0;
+}
+
+/* Set free space in the rtbitmap based on rtrmapbt records. */
+STATIC int
+xrep_rtbitmap_walk_rtrmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rtbitmap *rtb = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rtb->sc, &error))
+ return error;
+
+ if (rtb->next_rgbno < rec->rm_startblock) {
+ error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ rtb->next_rgbno = max(rtb->next_rgbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Walk the rtrmapbt to find all the gaps between records, and mark the gaps
+ * in the realtime bitmap that we're computing.
+ */
+STATIC int
+xrep_rtbitmap_find_freespace(
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_scrub *sc = rtb->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ uint64_t blockcount;
+ int error;
+
+ /* Prepare a buffer of ones so that we can accelerate bulk setting. */
+ memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize);
+
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+ error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap,
+ rtb);
+ if (error)
+ goto out;
+
+ /*
+ * Mark as free every possible rt extent from the last one we saw to
+ * the end of the rt group.
+ */
+ blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ if (rtb->next_rgbno < blockcount) {
+ error = xrep_rtbitmap_mark_free(rtb, blockcount);
+ if (error)
+ goto out;
+ }
+
+out:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ return error;
+}
+
+static int
+xrep_rtbitmap_prep_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ void *data)
+{
+ struct xchk_rtbitmap *rtb = data;
+ struct xfs_mount *mp = sc->mp;
+ union xfs_rtword_raw *ondisk;
+ int error;
+
+ rtb->args.mp = sc->mp;
+ rtb->args.tp = sc->tp;
+ rtb->args.rbmbp = bp;
+ ondisk = xfs_rbmblock_wordptr(&rtb->args, 0);
+ rtb->args.rbmbp = NULL;
+
+ error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk,
+ mp->m_blockwsize);
+ if (error)
+ return error;
+
+ if (xfs_has_rtgroups(sc->mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+ hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ bp->b_ops = &xfs_rtbitmap_buf_ops;
+ } else {
+ bp->b_ops = &xfs_rtbuf_ops;
+ }
+
+ rtb->prep_wordoff += mp->m_blockwsize;
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF);
+ return 0;
+}
+
/*
* Make sure that the given range of the data fork of the realtime file is
* mapped to written blocks. The caller must ensure that the inode is joined
@@ -108,8 +461,6 @@ xrep_rtbitmap_data_mappings(
0, &map, &nmaps);
if (error)
return error;
- if (nmaps != 1)
- return -EFSCORRUPTED;
/* Commit new extent and all deferred work. */
error = xrep_defer_finish(sc);
@@ -162,9 +513,18 @@ xrep_rtbitmap(
{
struct xchk_rtbitmap *rtb = sc->buf;
struct xfs_mount *mp = sc->mp;
+ struct xfs_group *xg = rtg_group(sc->sr.rtg);
unsigned long long blocks = 0;
+ unsigned int busy_gen;
int error;
+ /* We require the realtime rmapbt to rebuild anything. */
+ if (!xfs_has_rtrmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
/* Impossibly large rtbitmap means we can't touch the filesystem. */
if (rtb->rbmblocks > U32_MAX)
return 0;
@@ -197,6 +557,79 @@ xrep_rtbitmap(
if (error)
return error;
- /* Fix inconsistent bitmap geometry */
- return xrep_rtbitmap_geometry(sc, rtb);
+ /*
+ * Fix inconsistent bitmap geometry. This function returns with a
+ * clean scrub transaction.
+ */
+ error = xrep_rtbitmap_geometry(sc, rtb);
+ if (error)
+ return error;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice.
+ */
+ if (!xfs_extent_busy_list_empty(xg, &busy_gen)) {
+ error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Generate the new rtbitmap data. We don't need the rtbmp information
+ * once this call is finished.
+ */
+ error = xrep_rtbitmap_find_freespace(rtb);
+ if (error)
+ return error;
+
+ /*
+ * Try to take ILOCK_EXCL of the temporary file. We had better be the
+ * only ones holding onto this inode, but we can't block while holding
+ * the rtbitmap file's ILOCK_EXCL.
+ */
+ while (!xrep_tempfile_ilock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ /*
+ * Make sure we have space allocated for the part of the bitmap
+ * file that corresponds to this group. We already joined sc->ip.
+ */
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Copy the bitmap file that we generated. */
+ error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks,
+ xrep_rtbitmap_prep_buf, rtb);
+ if (error)
+ return error;
+ error = xrep_tempfile_set_isize(sc,
+ XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks));
+ if (error)
+ return error;
+
+ /*
+ * Now exchange the data fork contents. We're done with the temporary
+ * buffer, so we can reuse it for the tempfile exchmaps information.
+ */
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0,
+ rtb->rbmblocks, &rtb->tempexch);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_contents(sc, &rtb->tempexch);
+ if (error)
+ return error;
+
+ /* Free the old rtbitmap blocks if they're not in use. */
+ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
}
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
new file mode 100644
index 000000000000..4c5dffc73641
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtalloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/repair.h"
+
+/* Set us up with the realtime refcount metadata locked. */
+int
+xchk_setup_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtrefcountbt(sc);
+ if (error)
+ return error;
+ }
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+
+ error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg));
+ if (error)
+ return error;
+
+ return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+}
+
+/* Realtime Reference count btree scrubber. */
+
+/*
+ * Confirming Reference Counts via Reverse Mappings
+ *
+ * We want to count the reverse mappings overlapping a refcount record
+ * (bno, len, refcount), allowing for the possibility that some of the
+ * overlap may come from smaller adjoining reverse mappings, while some
+ * comes from single extents which overlap the range entirely. The
+ * outer loop is as follows:
+ *
+ * 1. For all reverse mappings overlapping the refcount extent,
+ * a. If a given rmap completely overlaps, mark it as seen.
+ * b. Otherwise, record the fragment (in agbno order) for later
+ * processing.
+ *
+ * Once we've seen all the rmaps, we know that for all blocks in the
+ * refcount record we want to find $refcount owners and we've already
+ * visited $seen extents that overlap all the blocks. Therefore, we
+ * need to find ($refcount - $seen) owners for every block in the
+ * extent; call that quantity $target_nr. Proceed as follows:
+ *
+ * 2. Pull the first $target_nr fragments from the list; all of them
+ * should start at or before the start of the extent.
+ * Call this subset of fragments the working set.
+ * 3. Until there are no more unprocessed fragments,
+ * a. Find the shortest fragments in the set and remove them.
+ * b. Note the block number of the end of these fragments.
+ * c. Pull the same number of fragments from the list. All of these
+ * fragments should start at the block number recorded in the
+ * previous step.
+ * d. Put those fragments in the set.
+ * 4. Check that there are $target_nr fragments remaining in the list,
+ * and that they all end at or beyond the end of the refcount extent.
+ *
+ * If the refcount is correct, all the check conditions in the algorithm
+ * should always hold true. If not, the refcount is incorrect.
+ */
+struct xchk_rtrefcnt_frag {
+ struct list_head list;
+ struct xfs_rmap_irec rm;
+};
+
+struct xchk_rtrefcnt_check {
+ struct xfs_scrub *sc;
+ struct list_head fragments;
+
+ /* refcount extent we're examining */
+ xfs_rgblock_t bno;
+ xfs_extlen_t len;
+ xfs_nlink_t refcount;
+
+ /* number of owners seen */
+ xfs_nlink_t seen;
+};
+
+/*
+ * Decide if the given rmap is large enough that we can redeem it
+ * towards refcount verification now, or if it's a fragment, in
+ * which case we'll hang onto it in the hopes that we'll later
+ * discover that we've collected exactly the correct number of
+ * fragments as the rtrefcountbt says we should have.
+ */
+STATIC int
+xchk_rtrefcountbt_rmap_check(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xchk_rtrefcnt_check *refchk = priv;
+ struct xchk_rtrefcnt_frag *frag;
+ xfs_rgblock_t rm_last;
+ xfs_rgblock_t rc_last;
+ int error = 0;
+
+ if (xchk_should_terminate(refchk->sc, &error))
+ return error;
+
+ rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
+ rc_last = refchk->bno + refchk->len - 1;
+
+ /* Confirm that a single-owner refc extent is a CoW stage. */
+ if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
+ xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
+ return 0;
+ }
+
+ if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
+ /*
+ * The rmap overlaps the refcount record, so we can confirm
+ * one refcount owner seen.
+ */
+ refchk->seen++;
+ } else {
+ /*
+ * This rmap covers only part of the refcount record, so
+ * save the fragment for later processing. If the rmapbt
+ * is healthy each rmap_irec we see will be in agbno order
+ * so we don't need insertion sort here.
+ */
+ frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag),
+ XCHK_GFP_FLAGS);
+ if (!frag)
+ return -ENOMEM;
+ memcpy(&frag->rm, rec, sizeof(frag->rm));
+ list_add_tail(&frag->list, &refchk->fragments);
+ }
+
+ return 0;
+}
+
+/*
+ * Given a bunch of rmap fragments, iterate through them, keeping
+ * a running tally of the refcount. If this ever deviates from
+ * what we expect (which is the rtrefcountbt's refcount minus the
+ * number of extents that totally covered the rtrefcountbt extent),
+ * we have a rtrefcountbt error.
+ */
+STATIC void
+xchk_rtrefcountbt_process_rmap_fragments(
+ struct xchk_rtrefcnt_check *refchk)
+{
+ struct list_head worklist;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ xfs_rgblock_t bno;
+ xfs_rgblock_t rbno;
+ xfs_rgblock_t next_rbno;
+ xfs_nlink_t nr;
+ xfs_nlink_t target_nr;
+
+ target_nr = refchk->refcount - refchk->seen;
+ if (target_nr == 0)
+ return;
+
+ /*
+ * There are (refchk->rc.rc_refcount - refchk->nr refcount)
+ * references we haven't found yet. Pull that many off the
+ * fragment list and figure out where the smallest rmap ends
+ * (and therefore the next rmap should start). All the rmaps
+ * we pull off should start at or before the beginning of the
+ * refcount record's range.
+ */
+ INIT_LIST_HEAD(&worklist);
+ rbno = NULLRGBLOCK;
+
+ /* Make sure the fragments actually /are/ in bno order. */
+ bno = 0;
+ list_for_each_entry(frag, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock < bno)
+ goto done;
+ bno = frag->rm.rm_startblock;
+ }
+
+ /*
+ * Find all the rmaps that start at or before the refc extent,
+ * and put them on the worklist.
+ */
+ nr = 0;
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno < rbno)
+ rbno = bno;
+ list_move_tail(&frag->list, &worklist);
+ nr++;
+ }
+
+ /*
+ * We should have found exactly $target_nr rmap fragments starting
+ * at or before the refcount extent.
+ */
+ if (nr != target_nr)
+ goto done;
+
+ while (!list_empty(&refchk->fragments)) {
+ /* Discard any fragments ending at rbno from the worklist. */
+ nr = 0;
+ next_rbno = NULLRGBLOCK;
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (bno != rbno) {
+ if (bno < next_rbno)
+ next_rbno = bno;
+ continue;
+ }
+ list_del(&frag->list);
+ kfree(frag);
+ nr++;
+ }
+
+ /* Try to add nr rmaps starting at rbno to the worklist. */
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
+ if (frag->rm.rm_startblock != rbno)
+ goto done;
+ list_move_tail(&frag->list, &worklist);
+ if (next_rbno > bno)
+ next_rbno = bno;
+ nr--;
+ if (nr == 0)
+ break;
+ }
+
+ /*
+ * If we get here and nr > 0, this means that we added fewer
+ * items to the worklist than we discarded because the fragment
+ * list ran out of items. Therefore, we cannot maintain the
+ * required refcount. Something is wrong, so we're done.
+ */
+ if (nr)
+ goto done;
+
+ rbno = next_rbno;
+ }
+
+ /*
+ * Make sure the last extent we processed ends at or beyond
+ * the end of the refcount extent.
+ */
+ if (rbno < refchk->bno + refchk->len)
+ goto done;
+
+ /* Actually record us having seen the remaining refcount. */
+ refchk->seen = refchk->refcount;
+done:
+ /* Delete fragments and work list. */
+ list_for_each_entry_safe(frag, n, &worklist, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+ list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Use the rmap entries covering this extent to verify the refcount. */
+STATIC void
+xchk_rtrefcountbt_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ struct xchk_rtrefcnt_check refchk = {
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
+ .seen = 0,
+ };
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ struct xchk_rtrefcnt_frag *frag;
+ struct xchk_rtrefcnt_frag *n;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Cross-reference with the rmapbt to confirm the refcount. */
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = irec->rc_startblock;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
+
+ INIT_LIST_HEAD(&refchk.fragments);
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check, &refchk);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ goto out_free;
+
+ xchk_rtrefcountbt_process_rmap_fragments(&refchk);
+ if (irec->rc_refcount != refchk.seen)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+
+out_free:
+ list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
+ list_del(&frag->list);
+ kfree(frag);
+ }
+}
+
+/* Cross-reference with the other btrees. */
+STATIC void
+xchk_rtrefcountbt_xref(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc,
+ xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock),
+ irec->rc_blockcount);
+ xchk_rtrefcountbt_xref_rmap(sc, irec);
+}
+
+struct xchk_rtrefcbt_records {
+ /* Previous refcount record. */
+ struct xfs_refcount_irec prev_rec;
+
+ /* The next rtgroup block where we aren't expecting shared extents. */
+ xfs_rgblock_t next_unshared_rgbno;
+
+ /* Number of CoW blocks we expect. */
+ xfs_extlen_t cow_blocks;
+
+ /* Was the last record a shared or CoW staging extent? */
+ enum xfs_refc_domain prev_domain;
+};
+
+static inline bool
+xchk_rtrefcount_mergeable(
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *r2)
+{
+ const struct xfs_refcount_irec *r1 = &rrc->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+ if (r1->rc_blockcount > 0)
+ return false;
+
+ if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+ return false;
+ if (r1->rc_refcount != r2->rc_refcount)
+ return false;
+ if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+ XFS_REFC_LEN_MAX)
+ return false;
+
+ return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrefcountbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rtrefcbt_records *rrc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rtrefcount_mergeable(rrc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
+STATIC int
+xchk_rtrefcountbt_rmap_check_gap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ xfs_rgblock_t *next_bno = priv;
+
+ if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno)
+ return -ECANCELED;
+
+ *next_bno = rec->rm_startblock + rec->rm_blockcount;
+ return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_rtrefcountbt_xref_gaps(
+ struct xfs_scrub *sc,
+ struct xchk_rtrefcbt_records *rrc,
+ xfs_rtblock_t bno)
+{
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ xfs_rgblock_t next_bno = NULLRGBLOCK;
+ int error;
+
+ if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur ||
+ xchk_skip_xref(sc->sm))
+ return;
+
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = rrc->next_unshared_rgbno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno - 1;
+
+ error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high,
+ xchk_rtrefcountbt_rmap_check_gap, &next_bno);
+ if (error == -ECANCELED)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ else
+ xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur);
+}
+
+/* Scrub a rtrefcountbt record. */
+STATIC int
+xchk_rtrefcountbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xchk_rtrefcbt_records *rrc = bs->private;
+ struct xfs_refcount_irec irec;
+ u32 mod;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) !=
+ NULL) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* We can only share full rt extents. */
+ mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount);
+ if (mod)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ rrc->cow_blocks += irec.rc_blockcount;
+
+ /* Shared records always come before CoW records. */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+ rrc->prev_domain == XFS_REFC_DOMAIN_COW)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ rrc->prev_domain = irec.rc_domain;
+
+ xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
+ xchk_rtrefcountbt_xref(bs->sc, &irec);
+
+ /*
+ * If this is a record for a shared extent, check that all blocks
+ * between the previous record and this one have at most one reverse
+ * mapping.
+ */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+ xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+ rrc->next_unshared_rgbno = irec.rc_startblock +
+ irec.rc_blockcount;
+ }
+
+ return 0;
+}
+
+/* Make sure we have as many refc blocks as the rmap says. */
+STATIC void
+xchk_refcount_xref_rmap(
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *btree_oinfo,
+ xfs_extlen_t cow_blocks)
+{
+ xfs_filblks_t refcbt_blocks = 0;
+ xfs_filblks_t blocks;
+ int error;
+
+ if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Check that we saw as many refcbt blocks as the rmap knows about. */
+ error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks);
+ if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error))
+ return;
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo,
+ &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (blocks != refcbt_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+
+ /* Check that we saw as many cow blocks as the rmap knows about. */
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (blocks != cow_blocks)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* Scrub the refcount btree for some AG. */
+int
+xchk_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_owner_info btree_oinfo;
+ struct xchk_rtrefcbt_records rrc = {
+ .cow_blocks = 0,
+ .next_unshared_rgbno = 0,
+ .prev_domain = XFS_REFC_DOMAIN_SHARED,
+ };
+ int error;
+
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino,
+ XFS_DATA_FORK);
+ error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec,
+ &btree_oinfo, &rrc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /*
+ * Check that all blocks between the last refcount > 1 record and the
+ * end of the rt volume have at most one reverse mapping.
+ */
+ xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks);
+
+ xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks);
+
+ return 0;
+}
+
+/* xref check that a cow staging extent is marked in the rtrefcountbt. */
+void
+xchk_xref_is_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_refcount_irec rc;
+ int has_refcount;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ /* Find the CoW staging extent. */
+ error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (!has_refcount) {
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+ return;
+ }
+
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+
+ /* Must be at least as long as what was passed in */
+ if (rc.rc_blockcount < len)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/*
+ * xref check that the extent is not shared. Only file data blocks
+ * can have multiple owners.
+ */
+void
+xchk_xref_is_not_rt_shared(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_rt_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
+ bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
new file mode 100644
index 000000000000..983362447826
--- /dev/null
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -0,0 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_rtrefcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+#include "scrub/rcbag.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * rt refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the rt refcount btree root and
+ * insert all the records.
+ */
+
+struct xrep_rtrefc {
+ /* refcount extents */
+ struct xfarray *refcount_records;
+
+ /* new refcountbt information */
+ struct xrep_newbt new_btree;
+
+ /* old refcountbt blocks */
+ struct xfsb_bitmap old_rtrefcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the rt refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_filblks_t btblocks;
+};
+
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_rtrefc_check_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *rec)
+{
+ xfs_rgblock_t last;
+
+ if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0)
+ return -EFSCORRUPTED;
+
+ last = rec->rc_startblock + rec->rc_blockcount - 1;
+ if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space or misaligned. */
+ return xrep_require_rtext_inuse(sc, rec->rc_startblock,
+ rec->rc_blockcount);
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_rtrefc_stash(
+ struct xrep_rtrefc *rr,
+ enum xfs_refc_domain domain,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len,
+ uint64_t refcount)
+{
+ struct xfs_refcount_irec irec = {
+ .rc_startblock = bno,
+ .rc_blockcount = len,
+ .rc_refcount = refcount,
+ .rc_domain = domain,
+ };
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);
+
+ error = xrep_rtrefc_check_ext(rr->sc, &irec);
+ if (error)
+ return error;
+
+ trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec);
+
+ return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrefc_stash_cow(
+ struct xrep_rtrefc *rr,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_rtrefc_rmap_shareable(
+ const struct xfs_rmap_irec *rmap)
+{
+ /* rt metadata are never sharable */
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+
+ /* Unwritten file blocks are not shareable. */
+ if (rmap->rm_flags & XFS_RMAP_UNWRITTEN)
+ return false;
+
+ return true;
+}
+
+/* Grab the next (abbreviated) rmap record from the rmapbt. */
+STATIC int
+xrep_rtrefc_walk_rmaps(
+ struct xrep_rtrefc *rr,
+ struct xfs_rmap_irec *rmap,
+ bool *have_rec)
+{
+ struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents and the refcountbt blocks from the old tree for later
+ * disposal. We can only share written data fork extents, so
+ * keep looping until we find an rmap for one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
+ if (error)
+ return error;
+ } else if (xfs_is_sb_inum(mp, rmap->rm_owner) ||
+ (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+ XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ } while (!xrep_rtrefc_rmap_shareable(rmap));
+
+ *have_rec = true;
+ return 0;
+}
+
+static inline uint32_t
+xrep_rtrefc_encode_startblock(
+ const struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+/*
+ * Compare two refcount records. We want to sort in order of increasing block
+ * number.
+ */
+static int
+xrep_rtrefc_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_refcount_irec *ap = a;
+ const struct xfs_refcount_irec *bp = b;
+ uint32_t sa, sb;
+
+ sa = xrep_rtrefc_encode_startblock(ap);
+ sb = xrep_rtrefc_encode_startblock(bp);
+
+ if (sa > sb)
+ return 1;
+ if (sa < sb)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_rtrefc_sort_records(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_rgblock_t next_rgbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_rgbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_rgbno)
+ return -EFSCORRUPTED;
+
+ next_rgbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+/* Record extents that belong to the realtime refcount inode. */
+STATIC int
+xrep_rtrefc_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rr->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_check_ino_btree_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
+ rec->rm_blockcount);
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_rtrefc_push_rmaps_at(
+ struct xrep_rtrefc *rr,
+ struct rcbag *rcstack,
+ xfs_rgblock_t bno,
+ struct xfs_rmap_irec *rmap,
+ bool *have)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
+ if (error)
+ return error;
+
+ error = xrep_rtrefc_walk_rmaps(rr, rmap, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sr.rmap_cur);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Scan one AG for reverse mappings for the realtime refcount btree. */
+STATIC int
+xrep_rtrefc_scan_ag(
+ struct xrep_rtrefc *rr,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_rtrefc_find_refcounts(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct rcbag *rcstack;
+ struct xfs_perag *pag = NULL;
+ uint64_t old_stack_height;
+ xfs_rgblock_t sbno;
+ xfs_rgblock_t cbno;
+ xfs_rgblock_t nbno;
+ bool have;
+ int error;
+
+ /* Scan for old rtrefc btree blocks. */
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
+ error = xrep_rtrefc_scan_ag(rr, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+
+ /*
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If this exceeds
+ * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
+ */
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
+ if (error)
+ goto out_cur;
+
+ /* Start the rtrmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sr.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sr.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_height = rcbag_count(rcstack);
+
+ /* While stack isn't empty... */
+ while (rcbag_count(rcstack) > 0) {
+ /* Pop all rmaps that end at nbno */
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_rtrefc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
+ if (error)
+ goto out_bag;
+ }
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
+ error = xrep_rtrefc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_height);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (rcbag_count(rcstack) == 0)
+ break;
+ old_stack_height = rcbag_count(rcstack);
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(rcbag_count(rcstack) == 0);
+out_bag:
+ rcbag_free(&rcstack);
+out_cur:
+ xchk_rtgroup_btcur_free(&sc->sr);
+ return error;
+}
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_rtrefc_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(rr->refcount_records, rr->array_cur++,
+ &cur->bc_rec.rc);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrefc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rtrefc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrefc_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level,
+ nr_this_level);
+}
+
+/*
+ * Use the collected refcount information to stage a new rt refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_rtrefc_build_new_tree(
+ struct xrep_rtrefc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_btree_cur *refc_cur;
+ int error;
+
+ error = xrep_rtrefc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the realtime refcount inode.
+ */
+ error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+ if (error)
+ return error;
+
+ rr->new_btree.bload.get_records = xrep_rtrefc_get_records;
+ rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block;
+ rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size;
+
+ refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg);
+ xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake);
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire
+ * rtrefcountbt from the number of extents we found, and pump up our
+ * transaction to have sufficient block reservation. We're allowed
+ * to exceed quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg),
+ rr->new_btree.bload.nr_blocks, 0, true);
+ if (error)
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /* Add all observed refcount records. */
+ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new rtrefc btree in the inode. After this point the old
+ * btree is no longer accessible, the new tree is live, and we can
+ * delete the cursor.
+ */
+ xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp);
+ xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Rebuild the rt refcount btree. */
+int
+xrep_rtrefcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrefc *rr;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rtrmapbt(mp))
+ return -EOPNOTSUPP;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+ rr->sc = sc;
+
+ /* Set up enough storage to handle one refcount record per rt extent. */
+ descr = xchk_xfile_ag_descr(sc, "reference count records");
+ error = xfarray_create(descr, mp->m_sb.sb_rextents,
+ sizeof(struct xfs_refcount_irec),
+ &rr->refcount_records);
+ kfree(descr);
+ if (error)
+ goto out_rr;
+
+ /* Collect all reference counts. */
+ xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks);
+ error = xrep_rtrefc_find_refcounts(rr);
+ if (error)
+ goto out_bitmap;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the refcount information. */
+ error = xrep_rtrefc_build_new_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Free all the extents that were allocated to the former rtrefcountbt
+ * and aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc,
+ &rr->old_rtrefcountbt_blocks);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks);
+ xfarray_destroy(rr->refcount_records);
+out_rr:
+ kfree(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
new file mode 100644
index 000000000000..12989fe80e8b
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_metafile.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Set us up with the realtime metadata locked. */
+int
+xchk_setup_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtrmapbt(sc);
+ if (error)
+ return error;
+ }
+
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+
+ error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg));
+ if (error)
+ return error;
+
+ return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+}
+
+/* Realtime reverse mapping. */
+
+struct xchk_rtrmap {
+ /*
+ * The furthest-reaching of the rmapbt records that we've already
+ * processed. This enables us to detect overlapping records for space
+ * allocations that cannot be shared.
+ */
+ struct xfs_rmap_irec overlap_rec;
+
+ /*
+ * The previous rmapbt record, so that we can check for two records
+ * that could be one.
+ */
+ struct xfs_rmap_irec prev_rec;
+};
+
+static inline bool
+xchk_rtrmapbt_is_shareable(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *irec)
+{
+ if (!xfs_has_rtreflink(sc->mp))
+ return false;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ return false;
+ return true;
+}
+
+/* Flag failures for records that overlap but cannot. */
+STATIC void
+xchk_rtrmapbt_check_overlapping(
+ struct xchk_btree *bs,
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ xfs_rtblock_t pnext, inext;
+
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ /* No previous record? */
+ if (cr->overlap_rec.rm_blockcount == 0)
+ goto set_prev;
+
+ /* Do overlap_rec and irec overlap? */
+ pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount;
+ if (pnext <= irec->rm_startblock)
+ goto set_prev;
+
+ /* Overlap is only allowed if both records are data fork mappings. */
+ if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+ !xchk_rtrmapbt_is_shareable(bs->sc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* Save whichever rmap record extends furthest. */
+ inext = irec->rm_startblock + irec->rm_blockcount;
+ if (pnext > inext)
+ return;
+
+set_prev:
+ memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Decide if two reverse-mapping records can be merged. */
+static inline bool
+xchk_rtrmap_mergeable(
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *r2)
+{
+ const struct xfs_rmap_irec *r1 = &cr->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+ if (cr->prev_rec.rm_blockcount == 0)
+ return false;
+
+ if (r1->rm_owner != r2->rm_owner)
+ return false;
+ if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
+ return false;
+ if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
+ XFS_RMAP_LEN_MAX)
+ return false;
+ if (r1->rm_flags != r2->rm_flags)
+ return false;
+ return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rtrmapbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rtrmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rtrmap_mergeable(cr, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Cross-reference a rmap against the refcount btree. */
+STATIC void
+xchk_rtrmapbt_xref_rtrefc(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ xfs_rgblock_t fbno;
+ xfs_extlen_t flen;
+ bool is_inode;
+ bool is_bmbt;
+ bool is_attr;
+ bool is_unwritten;
+ int error;
+
+ if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+ is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+ /* If this is shared, must be a data fork extent. */
+ error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock,
+ irec->rm_blockcount, &fbno, &flen, false);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
+ return;
+ if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten))
+ xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
+}
+
+/* Cross-reference with other metadata. */
+STATIC void
+xchk_rtrmapbt_xref(
+ struct xfs_scrub *sc,
+ struct xfs_rmap_irec *irec)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc,
+ xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock),
+ irec->rm_blockcount);
+ if (irec->rm_owner == XFS_RMAP_OWN_COW)
+ xchk_xref_is_cow_staging(sc, irec->rm_startblock,
+ irec->rm_blockcount);
+ else
+ xchk_rtrmapbt_xref_rtrefc(sc, irec);
+}
+
+/* Scrub a realtime rmapbt record. */
+STATIC int
+xchk_rtrmapbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xchk_rtrmap *cr = bs->private;
+ struct xfs_rmap_irec irec;
+
+ if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
+ xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ xchk_rtrmapbt_check_mergeable(bs, cr, &irec);
+ xchk_rtrmapbt_check_overlapping(bs, cr, &irec);
+ xchk_rtrmapbt_xref(bs->sc, &irec);
+ return 0;
+}
+
+/* Scrub the realtime rmap btree. */
+int
+xchk_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = rtg_rmap(sc->sr.rtg);
+ struct xfs_owner_info oinfo;
+ struct xchk_rtrmap cr = { };
+ int error;
+
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK);
+ return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr);
+}
+
+/* xref check that the extent has no realtime reverse mapping at all */
+void
+xchk_xref_has_no_rt_owner(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is completely mapped */
+void
+xchk_xref_has_rt_owner(
+ struct xfs_scrub *sc,
+ xfs_rgblock_t bno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_FULL)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
+
+/* xref check that the extent is only owned by a given owner */
+void
+xchk_xref_is_only_rt_owned_by(
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_rmap_matches res;
+ int error;
+
+ if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res);
+ if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
+ return;
+ if (res.matches != 1)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ if (res.bad_non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+ if (res.non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
+}
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
new file mode 100644
index 000000000000..fc2592c53af5
--- /dev/null
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -0,0 +1,987 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/rgb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Realtime Reverse Mapping Btree Repair
+ * =====================================
+ *
+ * This isn't quite as difficult as repairing the rmap btree on the data
+ * device, since we only store the data fork extents of realtime files on the
+ * realtime device. We still have to freeze the filesystem and stop the
+ * background threads like we do for the rmap repair, but we only have to scan
+ * realtime inodes.
+ *
+ * Collecting entries for the new realtime rmap btree is easy -- all we have
+ * to do is generate rtrmap entries from the data fork mappings of all realtime
+ * files in the filesystem. We then scan the rmap btrees of the data device
+ * looking for extents belonging to the old btree and note them in a bitmap.
+ *
+ * To rebuild the realtime rmap btree, we bulk-load the collected mappings into
+ * a new btree cursor and atomically swap that into the realtime inode. Then
+ * we can free the blocks from the old btree.
+ *
+ * We use the 'xrep_rtrmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rtrmap {
+ /* new rtrmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rtrmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* bitmap of old rtrmapbt blocks */
+ struct xfsb_bitmap old_rtrmapbt_blocks;
+
+ /* Hooks into rtrmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* in-memory btree cursor for the ->get_blocks walk */
+ struct xfs_btree_cur *mcur;
+
+ /* Number of records we're staging in the new btree. */
+ uint64_t nr_records;
+};
+
+/* Set us up to repair rt reverse mapping btrees. */
+int
+xrep_setup_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rtrmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ return xrep_require_rtext_inuse(sc, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rtrmap_stash(
+ struct xrep_rtrmap *rr,
+ xfs_rgblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rtrmap_found(sc->mp, &rmap);
+
+ /* Add entry to in-memory btree. */
+ mutex_lock(&rr->lock);
+ mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+/* Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rtrmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ struct xrep_rtrmap *rr;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rtrmap_stash_accumulated(
+ struct xrep_rtrmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rtrmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap_ifork *rf = priv;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ xfs_rgblock_t rgbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (xfs_rtb_to_rgno(mp, rec->br_startblock) !=
+ rtg_rgno(rf->rr->sc->sr.rtg))
+ return 0;
+
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock);
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ rgbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rtrmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = rgbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that maps to the rt volume. Sets @mappings_done to true if we've
+ * scanned the block mappings in this fork.
+ */
+STATIC int
+xrep_rtrmap_scan_bmbt(
+ struct xrep_rtrmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rtrmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ int error = 0;
+
+ *mappings_done = false;
+
+ /*
+ * If the incore extent cache is already loaded, we'll just use the
+ * incore extent scanner to record mappings. Don't bother walking the
+ * ondisk extent tree.
+ */
+ if (!xfs_need_iread_extents(ifp))
+ return 0;
+
+ /* Accumulate all the mappings in the bmap btree. */
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK);
+ error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
+ /* Stash any remaining accumulated rmaps and exit. */
+ *mappings_done = true;
+ return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rtrmap_scan_iext(
+ struct xrep_rtrmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rtrmap_stash_accumulated(rf);
+}
+
+/* Find all the extents on the realtime device mapped by an inode fork. */
+STATIC int
+xrep_rtrmap_scan_dfork(
+ struct xrep_rtrmap *rr,
+ struct xfs_inode *ip)
+{
+ struct xrep_rtrmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ int error = 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ bool mappings_done;
+
+ /*
+ * Scan the bmbt for mappings. If the incore extent tree is
+ * loaded, we want to scan the cached mappings since that's
+ * faster when the extent counts are very high.
+ */
+ error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ /* realtime data forks should only be extents or btree */
+ return -EFSCORRUPTED;
+ }
+
+ /* Scan incore extent cache. */
+ return xrep_rtrmap_scan_iext(&rf, ifp);
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rtrmap_scan_inode(
+ struct xrep_rtrmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Skip the rt rmap btree inode. */
+ if (rr->sc->ip == ip)
+ return 0;
+
+ lock_mode = xfs_ilock_data_map_shared(ip);
+
+ /* Check the data fork if it's on the realtime device. */
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ error = xrep_rtrmap_scan_dfork(rr, ip);
+ if (error)
+ goto out_unlock;
+ }
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Record extents that belong to the realtime rmap inode. */
+STATIC int
+xrep_rtrmap_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rr->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_check_ino_btree_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks,
+ xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
+ rec->rm_blockcount);
+}
+
+/* Scan one AG for reverse mappings for the realtime rmap btree. */
+STATIC int
+xrep_rtrmap_scan_ag(
+ struct xrep_rtrmap *rr,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+struct xrep_rtrmap_stash_run {
+ struct xrep_rtrmap *rr;
+ uint64_t owner;
+};
+
+static int
+xrep_rtrmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rtrmap_stash_run *rsr = priv;
+ struct xrep_rtrmap *rr = rsr->rr;
+ xfs_rgblock_t rgbno = start;
+
+ return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rtrmap_stash_bitmap(
+ struct xrep_rtrmap *rr,
+ struct xrgb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rtrmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ };
+
+ return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rtrmap_walk_cowblocks(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec,
+ void *priv)
+{
+ struct xrgb_bitmap *bitmap = priv;
+
+ if (!xfs_refcount_check_domain(irec) ||
+ irec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ return xrgb_bitmap_set(bitmap, irec->rc_startblock,
+ irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rtrmap_find_refcount_rmaps(
+ struct xrep_rtrmap *rr)
+{
+ struct xrgb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_rtreflink(sc->mp))
+ return 0;
+
+ xrgb_bitmap_init(&cow_blocks);
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high,
+ xrep_rtrmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xrgb_bitmap_destroy(&cow_blocks);
+ return error;
+}
+
+/* Count and check all collected records. */
+STATIC int
+xrep_rtrmap_check_record(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ int error;
+
+ error = xrep_rtrmap_check_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ rr->nr_records++;
+ return 0;
+}
+
+/* Generate all the reverse-mappings for the realtime device. */
+STATIC int
+xrep_rtrmap_find_rmaps(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = NULL;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Generate rmaps for the realtime superblock */
+ if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) {
+ error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize,
+ XFS_RMAP_OWN_FS, 0, 0);
+ if (error)
+ return error;
+ }
+
+ /* Find CoW staging extents. */
+ xrep_rtgroup_btcur_init(sc, &sc->sr);
+ error = xrep_rtrmap_find_refcount_rmaps(rr);
+ xchk_rtgroup_btcur_free(&sc->sr);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the realtime metadata inodes and cancel the transaction to
+ * release the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ xchk_rtgroup_unlock(&sc->sr);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rtrmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the RT metadata in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_rt(sc);
+ if (error)
+ return error;
+ error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /* Scan for old rtrmap blocks. */
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
+ error = xrep_rtrmap_scan_ag(rr, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned rt files in the filesystem. At the same time,
+ * check all our records before we start building a new btree, which
+ * requires the rtbitmap lock.
+ */
+ mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree);
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr);
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Building the new rtrmap btree. */
+
+/* Retrieve rtrmapbt data for bulk load. */
+STATIC int
+xrep_rtrmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ int stat = 0;
+
+ error = xfs_btree_increment(rr->mcur, 0, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rtrmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rtrmap *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_rtrmap_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
+}
+
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rtrmap_build_new_tree(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_btree_cur *rmap_cur;
+ int error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the realtime rmapbt inode.
+ */
+ error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
+ if (error)
+ return error;
+
+ rr->new_btree.bload.get_records = xrep_rtrmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block;
+ rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size;
+
+ rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake);
+
+ /* Compute how many blocks we'll need for the rmaps collected. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire
+ * rtrmapbt from the number of extents we found, and pump up our
+ * transaction to have sufficient block reservation. We're allowed
+ * to exceed quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg),
+ rr->new_btree.bload.nr_blocks, 0, true);
+ if (error)
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Create a cursor to the in-memory btree so that we can bulk load the
+ * new btree.
+ */
+ rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree);
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_mcur;
+
+ /* Add all observed rmap records. */
+ rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Install the new rtrmap btree in the inode. After this point the old
+ * btree is no longer accessible, the new tree is live, and we can
+ * delete the cursor.
+ */
+ xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp);
+ xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Reaping the old btree. */
+
+static inline bool
+xrep_rtrmapbt_want_live_update(
+ struct xchk_iscan *iscan,
+ const struct xfs_owner_info *oi)
+{
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ /*
+ * We scanned the CoW staging extents before we started the iscan, so
+ * we need all the updates.
+ */
+ if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+ return true;
+
+ /* Ignore updates to files that the scanner hasn't visited yet. */
+ return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rtrmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the rtrmap ILOCK and is generating
+ * the update, so we must be careful about which parts of the struct
+ * xrep_rtrmap that we change.
+ */
+static int
+xrep_rtrmapbt_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_rmap_update_params *p = data;
+ struct xrep_rtrmap *rr;
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *mcur;
+ struct xfs_trans *tp;
+ void *txcookie;
+ int error;
+
+ rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
+ mp = rr->sc->mp;
+
+ if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo))
+ goto out_unlock;
+
+ trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);
+
+ error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+ if (error)
+ goto out_abort;
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
+ error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+ p->blockcount, &p->oinfo, p->unwritten);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rtrmap_btree, tp);
+ if (error)
+ goto out_cancel;
+
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+ return NOTIFY_DONE;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+out_unlock:
+ return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rtrmap_setup_scan(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+ xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);
+
+ /* Set up some storage */
+ error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp,
+ rtg_rgno(sc->sr.rtg));
+ if (error)
+ goto out_bitmap;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rtrmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * rtrmap ILOCK to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update);
+ error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rtrmap_btree);
+out_bitmap:
+ xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rtrmap_teardown(
+ struct xrep_rtrmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rtrmap_btree);
+ xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
+ mutex_destroy(&rr->lock);
+}
+
+/* Repair the realtime rmap btree. */
+int
+xrep_rtrmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rtrmap *rr = sc->buf;
+ int error;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ error = xrep_rtrmap_setup_scan(rr);
+ if (error)
+ return error;
+
+ /* Collect rmaps for realtime files. */
+ error = xrep_rtrmap_find_rmaps(rr);
+ if (error)
+ goto out_records;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the rtrmap information. */
+ error = xrep_rtrmap_build_new_tree(rr);
+ if (error)
+ goto out_records;
+
+ /*
+ * Free all the extents that were allocated to the former rtrmapbt and
+ * aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
+ if (error)
+ goto out_records;
+
+out_records:
+ xrep_rtrmap_teardown(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index fabd0ed9dfa6..4ac679c1bd29 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,10 +16,16 @@
#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/xfile.h"
+#include "scrub/repair.h"
+#include "scrub/tempexch.h"
+#include "scrub/rtsummary.h"
/*
* Realtime Summary
@@ -31,18 +37,6 @@
* (potentially large) amount of data in pageable memory.
*/
-struct xchk_rtsummary {
- struct xfs_rtalloc_args args;
-
- uint64_t rextents;
- uint64_t rbmblocks;
- uint64_t rsumsize;
- unsigned int rsumlevels;
-
- /* Memory buffer for the summary comparison. */
- union xfs_suminfo_raw words[];
-};
-
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -53,27 +47,41 @@ xchk_setup_rtsummary(
struct xchk_rtsummary *rts;
int error;
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize),
XCHK_GFP_FLAGS);
if (!rts)
return -ENOMEM;
sc->buf = rts;
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtsummary(sc, rts);
+ if (error)
+ return error;
+ }
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
*/
descr = xchk_xfile_descr(sc, "realtime summary file");
- error = xfile_create(descr, mp->m_rsumsize, &sc->xfile);
+ error = xfile_create(descr, XFS_FSB_TO_B(mp, mp->m_rsumblocks),
+ &sc->xfile);
kfree(descr);
if (error)
return error;
- error = xchk_trans_alloc(sc, 0);
+ error = xchk_trans_alloc(sc, rts->resblks);
if (error)
return error;
- error = xchk_install_live_inode(sc, mp->m_rsumip);
+ error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg));
if (error)
return error;
@@ -81,32 +89,27 @@ xchk_setup_rtsummary(
if (error)
return error;
- /*
- * Locking order requires us to take the rtbitmap first. We must be
- * careful to unlock it ourselves when we are done with the rtbitmap
- * file since the scrub infrastructure won't do that for us. Only
- * then we can lock the rtsummary inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP);
+ if (error)
+ return error;
/*
* Now that we've locked the rtbitmap and rtsummary, we can't race with
* growfsrt trying to expand the summary or change the size of the rt
* volume. Hence it is safe to compute and check the geometry values.
+ *
+ * Note that there is no strict requirement for an exclusive lock on the
+ * summary here, but to keep the locking APIs simple we lock both inodes
+ * exclusively here. If we ever start caring about running concurrent
+ * fsmap with scrub this could be changed.
*/
if (mp->m_sb.sb_rblocks) {
- xfs_filblks_t rsumblocks;
- int rextslog;
-
- rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
- rextslog = xfs_compute_rextslog(rts->rextents);
- rts->rsumlevels = rextslog + 1;
- rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
- rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
- rts->rbmblocks);
- rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
+ rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
+ rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
+ rts->rsumblocks =
+ xfs_rtsummary_blockcount(mp, &rts->rsumlevels);
}
+
return 0;
}
@@ -118,7 +121,7 @@ xfsum_load(
xfs_rtsumoff_t sumoff,
union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, rawinfo,
+ return xfile_load(sc->xfile, rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
@@ -129,19 +132,19 @@ xfsum_store(
xfs_rtsumoff_t sumoff,
const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &rawinfo,
+ return xfile_store(sc->xfile, &rawinfo,
sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
-static inline int
+inline int
xfsum_copyout(
struct xfs_scrub *sc,
xfs_rtsumoff_t sumoff,
union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
+ return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
@@ -150,6 +153,11 @@ xchk_rtsum_inc(
struct xfs_mount *mp,
union xfs_suminfo_raw *v)
{
+ if (xfs_has_rtgroups(mp)) {
+ be32_add_cpu(&v->rtg, 1);
+ return be32_to_cpu(v->rtg);
+ }
+
v->old += 1;
return v->old;
}
@@ -157,11 +165,12 @@ xchk_rtsum_inc(
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_scrub *sc = priv;
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
@@ -180,11 +189,11 @@ xchk_rtsum_record_free(
lenlog = xfs_highbit64(rec->ar_extcount);
offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
- rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ rtbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount);
if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
- xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
return -EFSCORRUPTED;
}
@@ -206,15 +215,14 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_blocks;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
- if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
+ if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) !=
+ rtg_bitmap(rtg)->i_disk_size)
return -EFSCORRUPTED;
- return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
- sc);
+ return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc);
}
/* Compare the rtsummary file against the one we computed. */
@@ -233,8 +241,9 @@ xchk_rtsum_compare(
xfs_rtsumoff_t sumoff = 0;
int error = 0;
- rts->args.mp = sc->mp;
+ rts->args.mp = mp;
rts->args.tp = sc->tp;
+ rts->args.rtg = sc->sr.rtg;
/* Mappings may not cross or lie beyond EOF. */
endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
@@ -301,31 +310,34 @@ xchk_rtsummary(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_inode *rbmip = rtg_bitmap(rtg);
+ struct xfs_inode *rsumip = rtg_summary(rtg);
struct xchk_rtsummary *rts = sc->buf;
- int error = 0;
+ int error;
/* Is sb_rextents correct? */
if (mp->m_sb.sb_rextents != rts->rextents) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
+ return 0;
}
/* Is m_rsumlevels correct? */
if (mp->m_rsumlevels != rts->rsumlevels) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* Is m_rsumsize correct? */
- if (mp->m_rsumsize != rts->rsumsize) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ if (mp->m_rsumblocks != rts->rsumblocks) {
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* The summary file length must be aligned to an fsblock. */
- if (mp->m_rsumip->i_disk_size & mp->m_blockmask) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ if (rsumip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/*
@@ -333,15 +345,15 @@ xchk_rtsummary(
* growfsrt expands the summary file before updating sb_rextents, so
* the file can be larger than rsumsize.
*/
- if (mp->m_rsumip->i_disk_size < rts->rsumsize) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ if (rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) {
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out_rbm;
+ return error;
/* Construct the new summary file from the rtbitmap. */
error = xchk_rtsum_compute(sc);
@@ -350,18 +362,12 @@ xchk_rtsummary(
* EFSCORRUPTED means the rtbitmap is corrupt, which is an xref
* error since we're checking the summary file.
*/
- xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
- error = 0;
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
+ return 0;
}
if (error)
- goto out_rbm;
+ return error;
/* Does the computed summary file match the actual rtsummary file? */
- error = xchk_rtsum_compare(sc);
-
-out_rbm:
- /* Unlock the rtbitmap since we're done with it. */
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- return error;
+ return xchk_rtsum_compare(sc);
}
diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h
new file mode 100644
index 000000000000..e44b04cb6e2d
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTSUMMARY_H__
+#define __XFS_SCRUB_RTSUMMARY_H__
+
+struct xchk_rtsummary {
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ struct xrep_tempexch tempexch;
+#endif
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ xfs_filblks_t rsumblocks;
+ unsigned int rsumlevels;
+ unsigned int resblks;
+
+ /* suminfo position of xfile as we write buffers to disk. */
+ xfs_rtsumoff_t prep_wordoff;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
+int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo, unsigned int nr_words);
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts);
+#else
+# define xrep_setup_rtsummary(sc, rts) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTSUMMARY_H__ */
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
new file mode 100644
index 000000000000..d593977d70df
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_exchmaps.h"
+#include "xfs_rtbitmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+#include "scrub/xfile.h"
+#include "scrub/rtsummary.h"
+
+/* Set us up to repair the rtsummary file. */
+int
+xrep_setup_rtsummary(
+ struct xfs_scrub *sc,
+ struct xchk_rtsummary *rts)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFREG);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new summary file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement rtsummary and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * rtsummary ILOCK) and cannot ask for more reservation.
+ */
+ blocks = mp->m_rsumblocks;
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rts->resblks += blocks;
+ return 0;
+}
+
+static int
+xrep_rtsummary_prep_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ void *data)
+{
+ struct xchk_rtsummary *rts = data;
+ struct xfs_mount *mp = sc->mp;
+ union xfs_suminfo_raw *ondisk;
+ int error;
+
+ rts->args.mp = mp;
+ rts->args.tp = sc->tp;
+ rts->args.rtg = sc->sr.rtg;
+ rts->args.sumbp = bp;
+ ondisk = xfs_rsumblock_infoptr(&rts->args, 0);
+ rts->args.sumbp = NULL;
+
+ error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize);
+ if (error)
+ return error;
+
+ if (xfs_has_rtgroups(sc->mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+ hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ bp->b_ops = &xfs_rtsummary_buf_ops;
+ } else {
+ bp->b_ops = &xfs_rtbuf_ops;
+ }
+
+ rts->prep_wordoff += mp->m_blockwsize;
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
+ return 0;
+}
+
+/* Repair the realtime summary. */
+int
+xrep_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtsummary *rts = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ /* Walk away if we disagree on the size of the rt bitmap. */
+ if (rts->rbmblocks != mp->m_sb.sb_rbmblocks)
+ return 0;
+
+ /* Make sure any problems with the fork are fixed. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ /*
+ * Try to take ILOCK_EXCL of the temporary file. We had better be the
+ * only ones holding onto this inode, but we can't block while holding
+ * the rtsummary file's ILOCK_EXCL.
+ */
+ while (!xrep_tempfile_ilock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ /* Make sure we have space allocated for the entire summary file. */
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ error = xrep_tempfile_prealloc(sc, 0, rts->rsumblocks);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Copy the rtsummary file that we generated. */
+ error = xrep_tempfile_copyin(sc, 0, rts->rsumblocks,
+ xrep_rtsummary_prep_buf, rts);
+ if (error)
+ return error;
+ error = xrep_tempfile_set_isize(sc, XFS_FSB_TO_B(mp, rts->rsumblocks));
+ if (error)
+ return error;
+
+ /*
+ * Now exchange the contents. Nothing in repair uses the temporary
+ * buffer, so we can reuse it for the tempfile exchrange information.
+ */
+ error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0,
+ rts->rsumblocks, &rts->tempexch);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_contents(sc, &rts->tempexch);
+ if (error)
+ return error;
+
+ /* Reset incore state and blow out the summary cache. */
+ if (sc->sr.rtg->rtg_rsum_cache)
+ memset(sc->sr.rtg->rtg_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
+
+ mp->m_rsumlevels = rts->rsumlevels;
+ mp->m_rsumblocks = rts->rsumblocks;
+
+ /* Free the old rtsummary blocks if they're not in use. */
+ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index caf324c2b991..76e24032e99a 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -15,6 +15,13 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
+#include "xfs_rmap.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -22,6 +29,8 @@
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
+#include "scrub/tempfile.h"
+#include "scrub/orphanage.h"
/*
* Online Scrub and Repair
@@ -140,6 +149,18 @@ xchk_probe(
if (xchk_should_terminate(sc, &error))
return error;
+ /*
+ * If the caller is probing to see if repair works but repair isn't
+ * built into the kernel, return EOPNOTSUPP because that's the signal
+ * that userspace expects. If online repair is built in, set the
+ * CORRUPT flag (without any of the usual tracing/logging) to force us
+ * into xrep_probe.
+ */
+ if (xchk_could_repair(sc)) {
+ if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR))
+ return -EOPNOTSUPP;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ }
return 0;
}
@@ -155,11 +176,53 @@ xchk_fsgates_disable(
trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
if (sc->flags & XCHK_FSGATES_DRAIN)
- xfs_drain_wait_disable();
+ xfs_defer_drain_wait_disable();
+
+ if (sc->flags & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_disable();
sc->flags &= ~XCHK_FSGATES_ALL;
}
+/* Free the resources associated with a scrub subtype. */
+void
+xchk_scrub_free_subord(
+ struct xfs_scrub_subord *sub)
+{
+ struct xfs_scrub *sc = sub->parent_sc;
+
+ ASSERT(sc->ip == sub->sc.ip);
+ ASSERT(sc->orphanage == sub->sc.orphanage);
+ ASSERT(sc->tempip == sub->sc.tempip);
+
+ sc->sm->sm_type = sub->old_smtype;
+ sc->sm->sm_flags = sub->old_smflags |
+ (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
+ sc->tp = sub->sc.tp;
+
+ if (sub->sc.buf) {
+ if (sub->sc.buf_cleanup)
+ sub->sc.buf_cleanup(sub->sc.buf);
+ kvfree(sub->sc.buf);
+ }
+ if (sub->sc.xmbtp)
+ xmbuf_free(sub->sc.xmbtp);
+ if (sub->sc.xfile)
+ xfile_destroy(sub->sc.xfile);
+
+ sc->ilock_flags = sub->sc.ilock_flags;
+ sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
+ sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
+
+ kfree(sub);
+}
+
/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
@@ -167,6 +230,8 @@ xchk_teardown(
int error)
{
xchk_ag_free(sc, &sc->sa);
+ xchk_rtgroup_btcur_free(&sc->sr);
+
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
error = xfs_trans_commit(sc->tp);
@@ -174,6 +239,8 @@ xchk_teardown(
xfs_trans_cancel(sc->tp);
sc->tp = NULL;
}
+ if (sc->sr.rtg)
+ xchk_rtgroup_free(sc, &sc->sr);
if (sc->ip) {
if (sc->ilock_flags)
xchk_iunlock(sc, sc->ilock_flags);
@@ -184,6 +251,10 @@ xchk_teardown(
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xmbtp) {
+ xmbuf_free(sc->xmbtp);
+ sc->xmbtp = NULL;
+ }
if (sc->xfile) {
xfile_destroy(sc->xfile);
sc->xfile = NULL;
@@ -196,6 +267,8 @@ xchk_teardown(
sc->buf = NULL;
}
+ xrep_tempfile_rele(sc);
+ xrep_orphanage_rele(sc);
xchk_fsgates_disable(sc);
return error;
}
@@ -267,7 +340,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
.has = xfs_has_rmapbt,
- .repair = xrep_notsupported,
+ .repair = xrep_rmapbt,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
@@ -304,37 +377,39 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xchk_setup_directory,
.scrub = xchk_directory,
- .repair = xrep_notsupported,
+ .repair = xrep_directory,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
.setup = xchk_setup_xattr,
.scrub = xchk_xattr,
- .repair = xrep_notsupported,
+ .repair = xrep_xattr,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
.setup = xchk_setup_symlink,
.scrub = xchk_symlink,
- .repair = xrep_notsupported,
+ .repair = xrep_symlink,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
.setup = xchk_setup_parent,
.scrub = xchk_parent,
- .repair = xrep_notsupported,
+ .repair = xrep_parent,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
- .type = ST_FS,
+ .type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
- .type = ST_FS,
+ .type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .repair = xrep_notsupported,
+ .repair = xrep_rtsummary,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
@@ -358,7 +433,60 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_fscounters,
.scrub = xchk_fscounters,
- .repair = xrep_notsupported,
+ .repair = xrep_fscounters,
+ },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
+ .type = ST_FS,
+ .setup = xchk_setup_quotacheck,
+ .scrub = xchk_quotacheck,
+ .repair = xrep_quotacheck,
+ },
+ [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */
+ .type = ST_FS,
+ .setup = xchk_setup_nlinks,
+ .scrub = xchk_nlinks,
+ .repair = xrep_nlinks,
+ },
+ [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */
+ .type = ST_FS,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_health_record,
+ .repair = xrep_notsupported,
+ },
+ [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */
+ .type = ST_INODE,
+ .setup = xchk_setup_dirtree,
+ .scrub = xchk_dirtree,
+ .has = xfs_has_parent,
+ .repair = xrep_dirtree,
+ },
+ [XFS_SCRUB_TYPE_METAPATH] = { /* metadata directory tree path */
+ .type = ST_GENERIC,
+ .setup = xchk_setup_metapath,
+ .scrub = xchk_metapath,
+ .has = xfs_has_metadir,
+ .repair = xrep_metapath,
+ },
+ [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rgsuperblock,
+ .scrub = xchk_rgsuperblock,
+ .has = xfs_has_rtsb,
+ .repair = xrep_rgsuperblock,
+ },
+ [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rtrmapbt,
+ .scrub = xchk_rtrmapbt,
+ .has = xfs_has_rtrmapbt,
+ .repair = xrep_rtrmapbt,
+ },
+ [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rtrefcountbt,
+ .scrub = xchk_rtrefcountbt,
+ .has = xfs_has_rtreflink,
+ .repair = xrep_rtrefcountbt,
},
};
@@ -407,6 +535,35 @@ xchk_validate_inputs(
if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
goto out;
break;
+ case ST_GENERIC:
+ break;
+ case ST_RTGROUP:
+ if (sm->sm_ino || sm->sm_gen)
+ goto out;
+ if (xfs_has_rtgroups(mp)) {
+ /*
+ * On a rtgroups filesystem, there won't be an rtbitmap
+ * or rtsummary file for group 0 unless there's
+ * actually a realtime volume attached. However, older
+ * xfs_scrub always calls the rtbitmap/rtsummary
+ * scrubbers with sm_agno==0 so transform the error
+ * code to ENOENT.
+ */
+ if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
+ if (sm->sm_agno == 0)
+ error = -ENOENT;
+ goto out;
+ }
+ } else {
+ /*
+ * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
+ * accepted sm_agno==0, so we still accept that for
+ * scrubbing pre-rtgroups filesystems.
+ */
+ if (sm->sm_agno != 0)
+ goto out;
+ }
+ break;
default:
goto out;
}
@@ -464,8 +621,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */
+/*
+ * Create a new scrub context from an existing one, but with a different scrub
+ * type.
+ */
+struct xfs_scrub_subord *
+xchk_scrub_create_subord(
+ struct xfs_scrub *sc,
+ unsigned int subtype)
+{
+ struct xfs_scrub_subord *sub;
+
+ sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
+ if (!sub)
+ return ERR_PTR(-ENOMEM);
+
+ sub->old_smtype = sc->sm->sm_type;
+ sub->old_smflags = sc->sm->sm_flags;
+ sub->parent_sc = sc;
+ memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
+ sub->sc.ops = &meta_scrub_ops[subtype];
+ sub->sc.sm->sm_type = subtype;
+ sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sub->sc.buf = NULL;
+ sub->sc.buf_cleanup = NULL;
+ sub->sc.xfile = NULL;
+ sub->sc.xmbtp = NULL;
+
+ return sub;
+}
+
/* Dispatch metadata scrubbing. */
-int
+STATIC int
xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
@@ -493,9 +680,6 @@ xfs_scrub_metadata(
if (error)
goto out;
- xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
- "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
-
sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
@@ -507,6 +691,7 @@ xfs_scrub_metadata(
sc->sm = sm;
sc->ops = &meta_scrub_ops[sm->sm_type];
sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc->relax = INIT_XCHK_RELAX;
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
@@ -610,3 +795,221 @@ try_harder:
run.retries++;
goto retry_op;
}
+
+/* Scrub one aspect of one piece of metadata. */
+int
+xfs_ioc_scrub_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(file, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Decide if there have been any scrub failures up to this point. */
+static inline int
+xfs_scrubv_check_barrier(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec *vectors,
+ const struct xfs_scrub_vec *stop_vec)
+{
+ const struct xfs_scrub_vec *v;
+ __u32 failmask;
+
+ failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
+
+ for (v = vectors; v < stop_vec; v++) {
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
+ continue;
+
+ /*
+ * Runtime errors count as a previous failure, except the ones
+ * used to ask userspace to retry.
+ */
+ switch (v->sv_ret) {
+ case -EBUSY:
+ case -ENOENT:
+ case -EUSERS:
+ case 0:
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ /*
+ * If any of the out-flags on the scrub vector match the mask
+ * that was set on the barrier vector, that's a previous fail.
+ */
+ if (v->sv_flags & failmask)
+ return -ECANCELED;
+ }
+
+ return 0;
+}
+
+/*
+ * If the caller provided us with a nonzero inode number that isn't the ioctl
+ * file, try to grab a reference to it to eliminate all further untrusted inode
+ * lookups. If we can't get the inode, let each scrub function try again.
+ */
+STATIC struct xfs_inode *
+xchk_scrubv_open_by_handle(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_vec_head *head)
+{
+ struct xfs_trans *tp;
+ struct xfs_inode *ip;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return NULL;
+
+ error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
+ xfs_trans_cancel(tp);
+ if (error)
+ return NULL;
+
+ if (VFS_I(ip)->i_generation != head->svh_gen) {
+ xfs_irele(ip);
+ return NULL;
+ }
+
+ return ip;
+}
+
+/* Vectored scrub implementation to reduce ioctl calls. */
+int
+xfs_ioc_scrubv_metadata(
+ struct file *file,
+ void __user *arg)
+{
+ struct xfs_scrub_vec_head head;
+ struct xfs_scrub_vec_head __user *uhead = arg;
+ struct xfs_scrub_vec *vectors;
+ struct xfs_scrub_vec __user *uvectors;
+ struct xfs_inode *ip_in = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip_in->i_mount;
+ struct xfs_inode *handle_ip = NULL;
+ struct xfs_scrub_vec *v;
+ size_t vec_bytes;
+ unsigned int i;
+ int error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&head, uhead, sizeof(head)))
+ return -EFAULT;
+
+ if (head.svh_reserved)
+ return -EINVAL;
+ if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
+ return -EINVAL;
+ if (head.svh_nr == 0)
+ return 0;
+
+ vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
+ if (vec_bytes > PAGE_SIZE)
+ return -ENOMEM;
+
+ uvectors = u64_to_user_ptr(head.svh_vectors);
+ vectors = memdup_user(uvectors, vec_bytes);
+ if (IS_ERR(vectors))
+ return PTR_ERR(vectors);
+
+ trace_xchk_scrubv_start(ip_in, &head);
+
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ if (v->sv_reserved) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
+ (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ trace_xchk_scrubv_item(mp, &head, i, v);
+ }
+
+ /*
+ * If the caller wants us to do a scrub-by-handle and the file used to
+ * call the ioctl is not the same file, load the incore inode and pin
+ * it across all the scrubv actions to avoid repeated UNTRUSTED
+ * lookups. The reference is not passed to deeper layers of scrub
+ * because each scrubber gets to decide its own strategy and return
+ * values for getting an inode.
+ */
+ if (head.svh_ino && head.svh_ino != ip_in->i_ino)
+ handle_ip = xchk_scrubv_open_by_handle(mp, &head);
+
+ /* Run all the scrubbers. */
+ for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
+ struct xfs_scrub_metadata sm = {
+ .sm_type = v->sv_type,
+ .sm_flags = v->sv_flags,
+ .sm_ino = head.svh_ino,
+ .sm_gen = head.svh_gen,
+ .sm_agno = head.svh_agno,
+ };
+
+ if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
+ v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
+ if (v->sv_ret) {
+ trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
+ break;
+ }
+
+ continue;
+ }
+
+ v->sv_ret = xfs_scrub_metadata(file, &sm);
+ v->sv_flags = sm.sm_flags;
+
+ trace_xchk_scrubv_outcome(mp, &head, i, v);
+
+ if (head.svh_rest_us) {
+ ktime_t expires;
+
+ expires = ktime_add_ns(ktime_get(),
+ head.svh_rest_us * 1000);
+ set_current_state(TASK_KILLABLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
+
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out_free;
+ }
+ }
+
+ if (copy_to_user(uvectors, vectors, vec_bytes) ||
+ copy_to_user(uhead, &head, sizeof(head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ if (handle_ip)
+ xfs_irele(handle_ip);
+ kfree(vectors);
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 7fc50654c4fe..a3f1abc91390 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,49 @@
struct xfs_scrub;
+struct xchk_relax {
+ unsigned long next_resched;
+ unsigned int resched_nr;
+ bool interruptible;
+};
+
+/* Yield to the scheduler at most 10x per second. */
+#define XCHK_RELAX_NEXT (jiffies + (HZ / 10))
+
+#define INIT_XCHK_RELAX \
+ (struct xchk_relax){ \
+ .next_resched = XCHK_RELAX_NEXT, \
+ .resched_nr = 0, \
+ .interruptible = true, \
+ }
+
+/*
+ * Relax during a scrub operation and exit if there's a fatal signal pending.
+ *
+ * If preemption is disabled, we need to yield to the scheduler every now and
+ * then so that we don't run afoul of the soft lockup watchdog or RCU stall
+ * detector. cond_resched calls are somewhat expensive (~5ns) so we want to
+ * ratelimit this to 10x per second. Amortize the cost of the other checks by
+ * only doing it once every 100 calls.
+ */
+static inline int xchk_maybe_relax(struct xchk_relax *widget)
+{
+ /* Amortize the cost of scheduling and checking signals. */
+ if (likely(++widget->resched_nr < 100))
+ return 0;
+ widget->resched_nr = 0;
+
+ if (unlikely(widget->next_resched <= jiffies)) {
+ cond_resched();
+ widget->next_resched = XCHK_RELAX_NEXT;
+ }
+
+ if (widget->interruptible && fatal_signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* Standard flags for allocating memory within scrub. NOFS context is
* configured by the process allocation scope. Scrub and repair must be able
@@ -17,12 +60,21 @@ struct xfs_scrub;
#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
__GFP_RETRY_MAYFAIL))
+/*
+ * For opening files by handle for fsck operations, we don't trust the inumber
+ * or the allocation state; therefore, perform an untrusted lookup. We don't
+ * want these inodes to pollute the cache, so mark them for immediate removal.
+ */
+#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE)
+
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
ST_PERAG, /* per-AG metadata */
ST_FS, /* per-FS metadata */
ST_INODE, /* per-inode metadata */
+ ST_GENERIC, /* determined by the scrubber */
+ ST_RTGROUP, /* rtgroup metadata */
};
struct xchk_meta_ops {
@@ -44,7 +96,7 @@ struct xchk_meta_ops {
int (*repair_eval)(struct xfs_scrub *sc);
/* Decide if we even have this piece of metadata. */
- bool (*has)(struct xfs_mount *);
+ bool (*has)(const struct xfs_mount *);
/* type describing required/allowed inputs */
enum xchk_type type;
@@ -67,6 +119,19 @@ struct xchk_ag {
struct xfs_btree_cur *refc_cur;
};
+/* Inode lock state for the RT volume. */
+struct xchk_rt {
+ /* incore rtgroup, if applicable */
+ struct xfs_rtgroup *rtg;
+
+ /* XFS_RTGLOCK_* lock state if locked */
+ unsigned int rtlock_flags;
+
+ /* rtgroup btrees */
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
+};
+
struct xfs_scrub {
/* General scrub state. */
struct xfs_mount *mp;
@@ -99,9 +164,20 @@ struct xfs_scrub {
/* xfile used by the scrubbers; freed at teardown. */
struct xfile *xfile;
+ /* buffer target for in-memory btrees; also freed at teardown. */
+ struct xfs_buftarg *xmbtp;
+
/* Lock flags for @ip. */
uint ilock_flags;
+ /* The orphanage, for stashing files that have lost their parent. */
+ uint orphanage_ilock_flags;
+ struct xfs_inode *orphanage;
+
+ /* A temporary file on this filesystem, for staging new metadata. */
+ struct xfs_inode *tempip;
+ uint temp_ilock_flags;
+
/* See the XCHK/XREP state flags below. */
unsigned int flags;
@@ -112,8 +188,20 @@ struct xfs_scrub {
*/
unsigned int sick_mask;
+ /*
+ * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for
+ * removing ZAPPED flags after a repair.
+ */
+ unsigned int healthy_mask;
+
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
/* State tracking for single-AG operations. */
struct xchk_ag sa;
+
+ /* State tracking for realtime operations. */
+ struct xchk_rt sr;
};
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
@@ -121,6 +209,9 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */
+#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */
#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
@@ -130,7 +221,44 @@ struct xfs_scrub {
* features are gated off via dynamic code patching, which is why the state
* must be enabled during scrub setup and can only be torn down afterwards.
*/
-#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN)
+#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \
+ XCHK_FSGATES_QUOTA | \
+ XCHK_FSGATES_DIRENTS | \
+ XCHK_FSGATES_RMAP)
+
+struct xfs_scrub_subord {
+ struct xfs_scrub sc;
+ struct xfs_scrub *parent_sc;
+ unsigned int old_smtype;
+ unsigned int old_smflags;
+};
+
+struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
+ unsigned int subtype);
+void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (fatal signal, etc.) then bail out.
+ * Note that we're careful not to make any judgements about *error.
+ */
+static inline bool
+xchk_should_terminate(
+ struct xfs_scrub *sc,
+ int *error)
+{
+ if (xchk_maybe_relax(&sc->relax)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
+static inline int xchk_nothing(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
@@ -150,31 +278,30 @@ int xchk_directory(struct xfs_scrub *sc);
int xchk_xattr(struct xfs_scrub *sc);
int xchk_symlink(struct xfs_scrub *sc);
int xchk_parent(struct xfs_scrub *sc);
+int xchk_dirtree(struct xfs_scrub *sc);
+int xchk_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
+int xchk_rgsuperblock(struct xfs_scrub *sc);
+int xchk_rtrmapbt(struct xfs_scrub *sc);
+int xchk_rtrefcountbt(struct xfs_scrub *sc);
#else
-static inline int
-xchk_rtbitmap(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_rtsummary(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_rtbitmap xchk_nothing
+# define xchk_rtsummary xchk_nothing
+# define xchk_rgsuperblock xchk_nothing
+# define xchk_rtrmapbt xchk_nothing
+# define xchk_rtrefcountbt xchk_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
+int xchk_quotacheck(struct xfs_scrub *sc);
#else
-static inline int
-xchk_quota(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_quota xchk_nothing
+# define xchk_quotacheck xchk_nothing
#endif
int xchk_fscounters(struct xfs_scrub *sc);
+int xchk_nlinks(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
@@ -198,8 +325,26 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
#ifdef CONFIG_XFS_RT
void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
xfs_extlen_t len);
+void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
+void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
+void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno,
+ xfs_extlen_t len);
#else
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0)
+# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0)
+# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0)
+# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0)
#endif
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index cd91db4a5548..f8a37ea97791 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -77,6 +77,13 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
[XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+ [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
+ [XFS_SCRUB_TYPE_NLINKS] = "nlinks",
+ [XFS_SCRUB_TYPE_DIRTREE] = "dirtree",
+ [XFS_SCRUB_TYPE_METAPATH] = "metapath",
+ [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper",
+ [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt",
+ [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
@@ -329,9 +336,9 @@ xchk_stats_register(
if (!cs->cs_debugfs)
return;
- debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
&scrub_stats_fops);
- debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
&clear_scrub_stats_fops);
}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index ddff86713df3..c848bcc07cd5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -10,24 +10,36 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
/* Set us up to scrub a symbolic link. */
int
xchk_setup_symlink(
struct xfs_scrub *sc)
{
+ unsigned int resblks = 0;
+ int error;
+
/* Allocate the buffer without the inode lock held. */
sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
- return xchk_setup_inode_contents(sc, 0);
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_symlink(sc, &resblks);
+ if (error)
+ return error;
+ }
+
+ return xchk_setup_inode_contents(sc, resblks);
}
/* Symbolic links. */
@@ -67,7 +79,7 @@ xchk_symlink(
}
/* Remote symlink; must read the contents. */
- error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ error = xfs_symlink_remote_read(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
new file mode 100644
index 000000000000..953ce7be78dc
--- /dev/null
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_symlink_remote.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/reap.h"
+#include "scrub/health.h"
+
+/*
+ * Symbolic Link Repair
+ * ====================
+ *
+ * We repair symbolic links by reading whatever target data we can find, up to
+ * the first NULL byte. If the recovered target strlen matches i_size, then
+ * we rewrite the target. In all other cases, we replace the target with an
+ * overly long string that cannot possibly resolve. The new target is written
+ * into a private hidden temporary file, and then a file contents exchange
+ * commits the new symlink target to the file being repaired.
+ */
+
+/* Set us up to repair the symlink file. */
+int
+xrep_setup_symlink(
+ struct xfs_scrub *sc,
+ unsigned int *resblks)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks;
+ int error;
+
+ error = xrep_tempfile_create(sc, S_IFLNK);
+ if (error)
+ return error;
+
+ /*
+ * If we're doing a repair, we reserve enough blocks to write out a
+ * completely new symlink file, plus twice as many blocks as we would
+ * need if we can only allocate one block per data fork mapping. This
+ * should cover the preallocation of the temporary file and exchanging
+ * the extent mappings.
+ *
+ * We cannot use xfs_exchmaps_estimate because we have not yet
+ * constructed the replacement symlink and therefore do not know how
+ * many extents it will use. By the time we do, we will have a dirty
+ * transaction (which we cannot drop because we cannot drop the
+ * symlink ILOCK) and cannot ask for more reservation.
+ */
+ blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN);
+ blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ *resblks += blocks;
+ return 0;
+}
+
+/*
+ * Try to salvage the pathname from remote blocks. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_remote(
+ struct xfs_scrub *sc)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_buf *bp;
+ char *target_buf = sc->buf;
+ xfs_failaddr_t fa;
+ xfs_filblks_t fsblocks;
+ xfs_daddr_t d;
+ loff_t len;
+ loff_t offset = 0;
+ unsigned int byte_cnt;
+ bool magic_ok;
+ bool hdr_ok;
+ int n;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int error;
+
+ /* We'll only read until the buffer is full. */
+ len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN);
+ fsblocks = xfs_symlink_blocks(sc->mp, len);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ for (n = 0; n < nmaps; n++) {
+ struct xfs_dsymlink_hdr *dsl;
+
+ d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock);
+
+ /* Read the rmt block. We'll run the verifiers manually. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount),
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ /* How many bytes do we expect to get out of this buffer? */
+ byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount);
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt);
+ byte_cnt = min_t(unsigned int, byte_cnt, len);
+
+ /*
+ * See if the verifiers accept this block. We're willing to
+ * salvage if the if the offset/byte/ino are ok and either the
+ * verifier passed or the magic is ok. Anything else and we
+ * stop dead in our tracks.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ dsl = bp->b_addr;
+ magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC);
+ hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp);
+ if (!hdr_ok || (fa != NULL && !magic_ok))
+ break;
+
+ memcpy(target_buf + offset, dsl + 1, byte_cnt);
+
+ len -= byte_cnt;
+ offset += byte_cnt;
+ }
+ return offset;
+}
+
+/*
+ * Try to salvage an inline symlink's contents. Returns the number of bytes
+ * salvaged or a negative errno.
+ */
+STATIC ssize_t
+xrep_symlink_salvage_inline(
+ struct xfs_scrub *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ char *target_buf = sc->buf;
+ char *old_target;
+ struct xfs_ifork *ifp;
+ unsigned int nr;
+
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ if (!ifp->if_data)
+ return 0;
+
+ /*
+ * If inode repair zapped the link target, pretend that we didn't find
+ * any bytes at all so that we can replace the (now totally lost) link
+ * target with a warning message.
+ */
+ old_target = ifp->if_data;
+ if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) &&
+ sc->ip->i_disk_size == 1 && old_target[0] == '?')
+ return 0;
+
+ nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ strncpy(target_buf, ifp->if_data, nr);
+ return nr;
+}
+
+#define DUMMY_TARGET \
+ "The target of this symbolic link could not be recovered at all and " \
+ "has been replaced with this explanatory message. To avoid " \
+ "accidentally pointing to an existing file path, this message is " \
+ "longer than the maximum supported file name length. That is an " \
+ "acceptable length for a symlink target on XFS but will produce " \
+ "File Name Too Long errors if resolved."
+
+/* Salvage whatever we can of the target. */
+STATIC int
+xrep_symlink_salvage(
+ struct xfs_scrub *sc)
+{
+ char *target_buf = sc->buf;
+ ssize_t buflen = 0;
+
+ BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX);
+
+ /*
+ * Salvage the target if there weren't any corruption problems observed
+ * while scanning it.
+ */
+ if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ buflen = xrep_symlink_salvage_inline(sc);
+ else
+ buflen = xrep_symlink_salvage_remote(sc);
+ if (buflen < 0)
+ return buflen;
+
+ /*
+ * NULL-terminate the buffer because the ondisk target does not
+ * do that for us. If salvage didn't find the exact amount of
+ * data that we expected to find, don't salvage anything.
+ */
+ target_buf[buflen] = 0;
+ if (strlen(target_buf) != sc->ip->i_disk_size)
+ buflen = 0;
+ }
+
+ /*
+ * Change an empty target into a dummy target and clear the symlink
+ * target zapped flag.
+ */
+ if (buflen == 0) {
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED);
+ sprintf(target_buf, DUMMY_TARGET);
+ }
+
+ trace_xrep_symlink_salvage_target(sc->ip, target_buf,
+ strlen(target_buf));
+ return 0;
+}
+
+STATIC void
+xrep_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL);
+
+ if (!xfs_has_crc(sc->mp))
+ return;
+
+ dsl->sl_owner = cpu_to_be64(sc->ip->i_ino);
+ xfs_trans_log_buf(tp, bp, 0,
+ sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1);
+}
+
+/*
+ * Prepare both links' data forks for an exchange. Promote the tempfile from
+ * local format to extents format, and if the file being repaired has a short
+ * format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_symlink_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the temp link is in shortform format, convert that to a remote
+ * target so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ int logflags = XFS_ILOG_CORE;
+
+ error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1,
+ &logflags, XFS_DATA_FORK,
+ xrep_symlink_local_to_remote,
+ sc);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(sc->tp, sc->ip, 0);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/* Exchange the temporary symlink's data fork with the one being repaired. */
+STATIC int
+xrep_symlink_swap(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx = sc->buf;
+ bool ip_local, temp_local;
+ int error;
+
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If the both links have a local format data fork and the rebuilt
+ * remote data would fit in the repaired file's data fork, copy the
+ * contents from the tempfile and declare ourselves done.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return 0;
+ }
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_symlink_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Free all the remote blocks and reset the data fork. The caller must join
+ * the inode to the transaction. This function returns with the inode joined
+ * to a clean scrub transaction.
+ */
+STATIC int
+xrep_symlink_reset_fork(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+ int error;
+
+ /* Unmap all the remote target buffers. */
+ if (xfs_ifork_has_extents(ifp)) {
+ error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ trace_xrep_symlink_reset_fork(sc->tempip);
+
+ /* Reset the temp symlink target to dummy content. */
+ xfs_idestroy_fork(ifp);
+ return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino,
+ "?", 1, 0, 0);
+}
+
+/*
+ * Reinitialize a link target. Caller must ensure the inode is joined to
+ * the transaction.
+ */
+STATIC int
+xrep_symlink_rebuild(
+ struct xfs_scrub *sc)
+{
+ struct xrep_tempexch *tx;
+ char *target_buf = sc->buf;
+ xfs_fsblock_t fs_blocks;
+ unsigned int target_len;
+ unsigned int resblks;
+ int error;
+
+ /* How many blocks do we need? */
+ target_len = strlen(target_buf);
+ ASSERT(target_len != 0);
+ if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN)
+ return -EFSCORRUPTED;
+
+ trace_xrep_symlink_rebuild(sc->ip);
+
+ /*
+ * In preparation to write the new symlink target to the temporary
+ * file, drop the ILOCK of the file being repaired (it shouldn't be
+ * joined) and take the ILOCK of the temporary file.
+ *
+ * The VFS does not take the IOLOCK while reading a symlink (and new
+ * symlinks are hidden with INEW until they've been written) so it's
+ * possible that a readlink() could see the old corrupted contents
+ * while we're doing this.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /*
+ * Reserve resources to reinitialize the target. We're allowed to
+ * exceed file quota to repair inconsistent metadata, though this is
+ * unlikely.
+ */
+ fs_blocks = xfs_symlink_blocks(sc->mp, target_len);
+ resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks);
+ error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0,
+ true);
+ if (error)
+ return error;
+
+ /* Erase the dummy target set up by the tempfile initialization. */
+ xfs_idestroy_fork(&sc->tempip->i_df);
+ sc->tempip->i_df.if_bytes = 0;
+ sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+ /* Write the salvaged target to the temporary link. */
+ error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino,
+ target_buf, target_len, fs_blocks, resblks);
+ if (error)
+ return error;
+
+ /*
+ * Commit the repair transaction so that we can use the atomic mapping
+ * exchange functions to compute the correct block reservations and
+ * re-lock the inodes.
+ */
+ target_buf = NULL;
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+
+ /*
+ * We're done with the temporary buffer, so we can reuse it for the
+ * tempfile contents exchange information.
+ */
+ tx = sc->buf;
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the temp link's data fork with the file being repaired.
+ * This recreates the transaction and takes the ILOCKs of the file
+ * being repaired and the temporary file.
+ */
+ error = xrep_symlink_swap(sc);
+ if (error)
+ return error;
+
+ /*
+ * Release the old symlink blocks and reset the data fork of the temp
+ * link to an empty shortform link. This is the last repair action we
+ * perform on the symlink, so we don't need to clean the transaction.
+ */
+ return xrep_symlink_reset_fork(sc);
+}
+
+/* Repair a symbolic link. */
+int
+xrep_symlink(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* The rmapbt is required to reap the old data fork. */
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+ /* We require atomic file exchange range to rebuild anything. */
+ if (!xfs_has_exchange_range(sc->mp))
+ return -EOPNOTSUPP;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xrep_symlink_salvage(sc);
+ if (error)
+ return error;
+
+ /* Now reset the target. */
+ error = xrep_symlink_rebuild(sc);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h
new file mode 100644
index 000000000000..eccda720c2ca
--- /dev/null
+++ b/fs/xfs/scrub/tempexch.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPEXCH_H__
+#define __XFS_SCRUB_TEMPEXCH_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+struct xrep_tempexch {
+ struct xfs_exchmaps_req req;
+};
+
+int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork,
+ xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti);
+int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork,
+ struct xrep_tempexch *ti);
+
+int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti);
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPEXCH_H__ */
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
new file mode 100644
index 000000000000..cf99e0ca51b0
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.c
@@ -0,0 +1,980 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_exchrange.h"
+#include "xfs_exchmaps.h"
+#include "xfs_defer.h"
+#include "xfs_symlink_remote.h"
+#include "xfs_metafile.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+
+/*
+ * Create a temporary file for reconstructing metadata, with the intention of
+ * atomically exchanging the temporary file's contents with the file that's
+ * being repaired.
+ */
+int
+xrep_tempfile_create(
+ struct xfs_scrub *sc,
+ uint16_t mode)
+{
+ struct xfs_icreate_args args = {
+ .pip = sc->mp->m_rootip,
+ .mode = mode,
+ .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE,
+ };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
+ struct xfs_trans_res *tres;
+ struct xfs_inode *dp = mp->m_rootip;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ bool is_dir = S_ISDIR(mode);
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(sc->tempip == NULL);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk. The temporary
+ * inode should be completely root owned so that we don't fail due to
+ * quota limits.
+ */
+ error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ resblks = xfs_mkdir_space_res(mp, 0);
+ tres = &M_RES(mp)->tr_mkdir;
+ } else {
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ }
+
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
+ if (error)
+ goto out_release_dquots;
+
+ /* Allocate inode, set up directory. */
+ error = xfs_dialloc(&tp, &args, &ino);
+ if (error)
+ goto out_trans_cancel;
+ error = xfs_icreate(tp, ino, &args, &sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ /* We don't touch file data, so drop the realtime flags. */
+ sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
+ xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
+
+ /*
+ * Mark our temporary file as private so that LSMs and the ACL code
+ * don't try to add their own metadata or reason about these files.
+ * The file should never be exposed to userspace.
+ */
+ VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
+ VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, sc->tempip, dp);
+ if (error)
+ goto out_trans_cancel;
+ } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) {
+ /*
+ * Initialize the temporary symlink with a meaningless target
+ * that won't trip the verifiers. Repair must rewrite the
+ * target with meaningful content before swapping with the file
+ * being repaired. A single-byte target will not write a
+ * remote target block, so the owner is irrelevant.
+ */
+ error = xfs_symlink_write_target(tp, sc->tempip,
+ sc->tempip->i_ino, ".", 1, 0, 0);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
+
+ /*
+ * Put our temp file on the unlinked list so it's purged automatically.
+ * All file-based metadata being reconstructed using this file must be
+ * atomically exchanged with the original file because the contents
+ * here will be purged when the inode is dropped or log recovery cleans
+ * out the unlinked list.
+ */
+ error = xfs_iunlink(tp, sc->tempip);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
+ trace_xrep_tempfile_create(sc);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ /* Finish setting up the incore / vfs context. */
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_setup_iops(sc->tempip);
+ xfs_finish_inode_setup(sc->tempip);
+
+ sc->temp_ilock_flags = 0;
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (sc->tempip) {
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(sc->tempip);
+ xchk_irele(sc, sc->tempip);
+ }
+out_release_dquots:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+/*
+ * Move sc->tempip from the regular directory tree to the metadata directory
+ * tree if sc->ip is part of the metadata directory tree and tempip has an
+ * eligible file mode.
+ *
+ * Temporary files have to be created before we even know which inode we're
+ * going to scrub, so we assume that they will be part of the regular directory
+ * tree. If it turns out that we're actually scrubbing a file from the
+ * metadata directory tree, we have to subtract the temp file from the root
+ * dquots and detach the dquots prior to setting the METADATA iflag. However,
+ * the scrub setup functions grab sc->ip and create sc->tempip before we
+ * actually get around to checking if the file mode is the right type for the
+ * scrubber.
+ */
+int
+xrep_tempfile_adjust_directory_tree(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!sc->tempip)
+ return 0;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(!xfs_is_metadir_inode(sc->tempip));
+
+ if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
+ return 0;
+ if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) &&
+ !S_ISREG(VFS_I(sc->tempip)->i_mode))
+ return 0;
+
+ xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_iolock;
+
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /* Metadir files are not accounted in quota, so drop icount */
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
+ xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN);
+
+ error = xrep_trans_commit(sc);
+ if (error)
+ goto out_ilock;
+
+ xfs_iflags_set(sc->tempip, XFS_IRECOVERY);
+ xfs_qm_dqdetach(sc->tempip);
+out_ilock:
+ xrep_tempfile_iunlock(sc);
+out_iolock:
+ xrep_tempfile_iounlock(sc);
+ return error;
+}
+
+/*
+ * Remove this temporary file from the metadata directory tree so that it can
+ * be inactivated the normal way.
+ */
+STATIC int
+xrep_tempfile_remove_metadir(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
+ return 0;
+
+ ASSERT(sc->tp == NULL);
+
+ xfs_iflags_clear(sc->tempip, XFS_IRECOVERY);
+
+ xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_iolock;
+
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ xfs_metafile_clear_iflag(sc->tp, sc->tempip);
+
+ /* Non-metadir files are accounted in quota, so bump bcount/icount */
+ error = xfs_qm_dqattach_locked(sc->tempip, false);
+ if (error)
+ goto out_cancel;
+
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
+ sc->tempip->i_nblocks);
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xrep_tempfile_iunlock(sc);
+out_iolock:
+ xrep_tempfile_iounlock(sc);
+ return error;
+}
+
+/* Take IOLOCK_EXCL on the temporary file, maybe. */
+bool
+xrep_tempfile_iolock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
+ * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
+ * to avoid deadlocks and lockdep complaints.
+ */
+int
+xrep_tempfile_iolock_polled(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ while (!xrep_tempfile_iolock_nowait(sc)) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ delay(1);
+ }
+
+ return 0;
+}
+
+/* Release IOLOCK_EXCL on the temporary file. */
+void
+xrep_tempfile_iounlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
+}
+
+/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
+void
+xrep_tempfile_ilock(
+ struct xfs_scrub *sc)
+{
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
+}
+
+/* Try to grab ILOCK_EXCL on the temporary file. */
+bool
+xrep_tempfile_ilock_nowait(
+ struct xfs_scrub *sc)
+{
+ if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ return true;
+ }
+
+ return false;
+}
+
+/* Unlock ILOCK_EXCL on the temporary file after an update. */
+void
+xrep_tempfile_iunlock(
+ struct xfs_scrub *sc)
+{
+ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
+ sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
+}
+
+/*
+ * Begin the process of making changes to both the file being scrubbed and
+ * the temporary file by taking ILOCK_EXCL on both.
+ */
+void
+xrep_tempfile_ilock_both(
+ struct xfs_scrub *sc)
+{
+ xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL);
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+}
+
+/* Unlock ILOCK_EXCL on both files. */
+void
+xrep_tempfile_iunlock_both(
+ struct xfs_scrub *sc)
+{
+ xrep_tempfile_iunlock(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+}
+
+/* Release the temporary file. */
+void
+xrep_tempfile_rele(
+ struct xfs_scrub *sc)
+{
+ if (!sc->tempip)
+ return;
+
+ if (sc->temp_ilock_flags) {
+ xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
+ sc->temp_ilock_flags = 0;
+ }
+
+ xrep_tempfile_remove_metadir(sc);
+ xchk_irele(sc, sc->tempip);
+ sc->tempip = NULL;
+}
+
+/*
+ * Make sure that the given range of the data fork of the temporary file is
+ * mapped to written blocks. The caller must ensure that both inodes are
+ * joined to the transaction.
+ */
+int
+xrep_tempfile_prealloc(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t end = off + len;
+ int error;
+
+ ASSERT(sc->tempip != NULL);
+ ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
+
+ for (; off < end; off = map.br_startoff + map.br_blockcount) {
+ int nmaps = 1;
+
+ /*
+ * If we have a real extent mapping this block then we're
+ * in ok shape.
+ */
+ error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ if (xfs_bmap_is_written_extent(&map))
+ continue;
+
+ /*
+ * If we find a delalloc reservation then something is very
+ * very wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /*
+ * Make sure this block has a real zeroed extent allocated to
+ * it.
+ */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
+ &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
+
+ /* Commit new extent and all deferred work. */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Write data to each block of a file. The given range of the tempfile's data
+ * fork must already be populated with written extents.
+ */
+int
+xrep_tempfile_copyin(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xrep_tempfile_copyin_fn prep_fn,
+ void *data)
+{
+ LIST_HEAD(buffers_list);
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_fileoff_t flush_mask;
+ xfs_fileoff_t end = off + len;
+ loff_t pos = XFS_FSB_TO_B(mp, off);
+ int error = 0;
+
+ ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
+
+ /* Flush buffers to disk every 512K */
+ flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
+
+ for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
+ struct xfs_bmbt_irec map;
+ int nmaps = 1;
+
+ /* Read block mapping for this file block. */
+ error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
+ if (error)
+ goto out_err;
+ if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
+ error = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ /* Get the metadata buffer for this offset in the file. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp);
+ if (error)
+ goto out_err;
+
+ trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
+
+ /* Read in a block's worth of data from the xfile. */
+ error = prep_fn(sc, bp, data);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ goto out_err;
+ }
+
+ /* Queue buffer, and flush if we have too much dirty data. */
+ xfs_buf_delwri_queue_here(bp, &buffers_list);
+ xfs_trans_brelse(sc->tp, bp);
+
+ if (!(off & flush_mask)) {
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+ }
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ error = xfs_buf_delwri_submit(&buffers_list);
+ if (error)
+ goto out_err;
+
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ error = -EIO;
+ goto out_err;
+ }
+
+ return 0;
+
+out_err:
+ xfs_buf_delwri_cancel(&buffers_list);
+ return error;
+}
+
+/*
+ * Set the temporary file's size. Caller must join the tempfile to the scrub
+ * transaction and is responsible for adjusting block mappings as needed.
+ */
+int
+xrep_tempfile_set_isize(
+ struct xfs_scrub *sc,
+ unsigned long long isize)
+{
+ if (sc->tempip->i_disk_size == isize)
+ return 0;
+
+ sc->tempip->i_disk_size = isize;
+ i_size_write(VFS_I(sc->tempip), isize);
+ return xrep_tempfile_roll_trans(sc);
+}
+
+/*
+ * Roll a repair transaction involving the temporary file. Caller must join
+ * both the temporary file and the file being scrubbed to the transaction.
+ * This function return with both inodes joined to a new scrub transaction,
+ * or the usual negative errno.
+ */
+int
+xrep_tempfile_roll_trans(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange request in preparation for atomically
+ * committing the contents of a metadata file that we've rebuilt in the temp
+ * file.
+ */
+STATIC int
+xrep_tempexch_prep_request(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+
+ memset(tx, 0, sizeof(struct xrep_tempexch));
+
+ /* COW forks don't exist on disk. */
+ if (whichfork == XFS_COW_FORK) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /* Both files should have the relevant forks. */
+ if (!xfs_ifork_ptr(sc->ip, whichfork) ||
+ !xfs_ifork_ptr(sc->tempip, whichfork)) {
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+ ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
+ return -EINVAL;
+ }
+
+ /* Exchange all mappings in both forks. */
+ req->ip1 = sc->tempip;
+ req->ip2 = sc->ip;
+ req->startoff1 = off;
+ req->startoff2 = off;
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ req->flags |= XFS_EXCHMAPS_ATTR_FORK;
+ break;
+ case XFS_DATA_FORK:
+ /* Exchange sizes when exchanging all data fork mappings. */
+ if (off == 0 && len == XFS_MAX_FILEOFF)
+ req->flags |= XFS_EXCHMAPS_SET_SIZES;
+ break;
+ }
+ req->blockcount = len;
+
+ return 0;
+}
+
+/*
+ * Fill out the mapping exchange resource estimation structures in preparation
+ * for exchanging the contents of a metadata file that we've rebuilt in the
+ * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
+ */
+STATIC int
+xrep_tempexch_estimate(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ struct xfs_exchmaps_req *req = &tx->req;
+ struct xfs_ifork *ifp;
+ struct xfs_ifork *tifp;
+ int whichfork = xfs_exchmaps_reqfork(req);
+ int state = 0;
+
+ /*
+ * The exchmaps code only knows how to exchange file fork space
+ * mappings. Any fork data in local format must be promoted to a
+ * single block before the exchange can take place.
+ */
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 1;
+
+ tifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
+ state |= 2;
+
+ switch (state) {
+ case 0:
+ /* Both files have mapped extents; use the regular estimate. */
+ return xfs_exchrange_estimate(req);
+ case 1:
+ /*
+ * The file being repaired is in local format, but the temp
+ * file has mapped extents. To perform the exchange, the file
+ * being repaired must have its shorform data converted to an
+ * ondisk block so that the forks will be in extents format.
+ * We need one resblk for the conversion; the number of
+ * exchanges is (worst case) the temporary file's extent count
+ * plus the block we converted.
+ */
+ req->ip1_bcount = sc->tempip->i_nblocks;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1 + tifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 2:
+ /*
+ * The temporary file is in local format, but the file being
+ * repaired has mapped extents. To perform the exchange, the
+ * temp file must have its shortform data converted to an
+ * ondisk block, and the fork changed to extents format. We
+ * need one resblk for the conversion; the number of exchanges
+ * is (worst case) the extent count of the file being repaired
+ * plus the block we converted.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = sc->ip->i_nblocks;
+ req->nr_exchanges = 1 + ifp->if_nextents;
+ req->resblks = 1;
+ break;
+ case 3:
+ /*
+ * Both forks are in local format. To perform the exchange,
+ * both files must have their shortform data converted to
+ * fsblocks, and both forks must be converted to extents
+ * format. We need two resblks for the two conversions, and
+ * the number of exchanges is 1 since there's only one block at
+ * fileoff 0. Presumably, the caller could not exchange the
+ * two inode fork areas directly.
+ */
+ req->ip1_bcount = 1;
+ req->ip2_bcount = 1;
+ req->nr_exchanges = 1;
+ req->resblks = 2;
+ break;
+ }
+
+ return xfs_exchmaps_estimate_overhead(req);
+}
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same. The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xrep_tempexch_reserve_quota(
+ struct xfs_scrub *sc,
+ const struct xrep_tempexch *tx)
+{
+ struct xfs_trans *tp = sc->tp;
+ const struct xfs_exchmaps_req *req = &tx->req;
+ int64_t ddelta, rdelta;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ xfs_is_metadir_inode(req->ip1) ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ /*
+ * Quota reservation for each file comes from two sources. First, we
+ * need to account for any net gain in mapped blocks during the
+ * exchange. Second, we need reservation for the gross gain in mapped
+ * blocks so that we don't trip over any quota block reservation
+ * assertions. We must reserve the gross gain because the quota code
+ * subtracts from bcount the number of blocks that we unmap; it does
+ * not add that quantity back to the quota block reservation.
+ */
+ ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
+ rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
+ true);
+ if (error)
+ return error;
+
+ ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
+ rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
+ true);
+}
+
+/*
+ * Prepare an existing transaction for an atomic file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource estimation
+ * structures in preparation for exchanging the contents of a metadata file
+ * that has been rebuilt in the temp file. Next, it reserves space and quota
+ * for the transaction.
+ *
+ * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
+ * file. The caller must join both inodes to the transaction with no unlock
+ * flags, and is responsible for dropping both ILOCKs when appropriate. Only
+ * use this when those ILOCKs cannot be dropped.
+ */
+int
+xrep_tempexch_trans_reserve(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(sc->tp != NULL);
+ xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
+
+ error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx);
+ if (error)
+ return error;
+
+ error = xfs_exchmaps_estimate(&tx->req);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
+ if (error)
+ return error;
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Create a new transaction for a file contents exchange.
+ *
+ * This function fills out the mapping excahange request and resource
+ * estimation structures in preparation for exchanging the contents of a
+ * metadata file that has been rebuilt in the temp file. Next, it reserves
+ * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
+ * reserves quota for the transaction.
+ *
+ * The caller is responsible for dropping both ILOCKs when appropriate.
+ */
+int
+xrep_tempexch_trans_alloc(
+ struct xfs_scrub *sc,
+ int whichfork,
+ struct xrep_tempexch *tx)
+{
+ unsigned int flags = 0;
+ int error;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF,
+ tx);
+ if (error)
+ return error;
+
+ error = xrep_tempexch_estimate(sc, tx);
+ if (error)
+ return error;
+
+ if (xfs_has_lazysbcount(sc->mp))
+ flags |= XFS_TRANS_RES_FDBLKS;
+
+ error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ tx->req.resblks, 0, flags, &sc->tp);
+ if (error)
+ return error;
+
+ sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
+
+ return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Exchange file mappings (and hence file contents) between the file being
+ * repaired and the temporary file. Returns with both inodes locked and joined
+ * to a clean scrub transaction.
+ */
+int
+xrep_tempexch_contents(
+ struct xfs_scrub *sc,
+ struct xrep_tempexch *tx)
+{
+ int error;
+
+ ASSERT(xfs_has_exchange_range(sc->mp));
+
+ xfs_exchange_mappings(sc->tp, &tx->req);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * If we exchanged the ondisk sizes of two metadata files, we must
+ * exchanged the incore sizes as well.
+ */
+ if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(sc->ip));
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ i_size_write(VFS_I(sc->tempip), temp);
+ }
+
+ return 0;
+}
+
+/*
+ * Write local format data from one of the temporary file's forks into the same
+ * fork of file being repaired, and exchange the file sizes, if appropriate.
+ * Caller must ensure that the file being repaired has enough fork space to
+ * hold all the bytes.
+ */
+void
+xrep_tempfile_copyout_local(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *temp_ifp;
+ struct xfs_ifork *ifp;
+ unsigned int ilog_flags = XFS_ILOG_CORE;
+
+ temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(temp_ifp != NULL);
+ ASSERT(ifp != NULL);
+ ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ ASSERT(sc->tempip->i_disk_size <=
+ xfs_inode_data_fork_size(sc->ip));
+ break;
+ case XFS_ATTR_FORK:
+ ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff);
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */
+ xfs_idestroy_fork(ifp);
+ xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data,
+ temp_ifp->if_bytes);
+
+ if (whichfork == XFS_DATA_FORK) {
+ i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+ sc->ip->i_disk_size = sc->tempip->i_disk_size;
+ }
+
+ ilog_flags |= xfs_ilog_fdata(whichfork);
+ xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
+}
+
+/* Decide if a given XFS inode is a temporary file for a repair. */
+bool
+xrep_is_tempfile(
+ const struct xfs_inode *ip)
+{
+ const struct inode *inode = &ip->i_vnode;
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * Files in the metadata directory tree also have S_PRIVATE set and
+ * IOP_XATTR unset, so we must distinguish them separately. We (ab)use
+ * the IRECOVERY flag to mark temporary metadir inodes knowing that the
+ * end of log recovery clears IRECOVERY, so the only ones that can
+ * exist during online repair are the ones we create.
+ */
+ if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA))
+ return __xfs_iflags_test(ip, XFS_IRECOVERY);
+
+ if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
new file mode 100644
index 000000000000..71c1b54599c3
--- /dev/null
+++ b/fs/xfs/scrub/tempfile.h
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_TEMPFILE_H__
+#define __XFS_SCRUB_TEMPFILE_H__
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
+void xrep_tempfile_rele(struct xfs_scrub *sc);
+
+int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc);
+
+bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
+int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
+void xrep_tempfile_iounlock(struct xfs_scrub *sc);
+
+void xrep_tempfile_ilock(struct xfs_scrub *sc);
+bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock(struct xfs_scrub *sc);
+void xrep_tempfile_iunlock_both(struct xfs_scrub *sc);
+void xrep_tempfile_ilock_both(struct xfs_scrub *sc);
+
+int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len);
+
+enum xfs_blft;
+
+typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc,
+ struct xfs_buf *bp, void *data);
+
+int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off,
+ xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data);
+
+int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize);
+
+int xrep_tempfile_roll_trans(struct xfs_scrub *sc);
+void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork);
+bool xrep_is_tempfile(const struct xfs_inode *ip);
+#else
+static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
+{
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+}
+# define xrep_is_tempfile(ip) (false)
+# define xrep_tempfile_adjust_directory_tree(sc) (0)
+# define xrep_tempfile_rele(sc)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_TEMPFILE_H__ */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index d0e24ffaf754..2450e214103f 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -16,10 +16,24 @@
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
#include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_rmap.h"
+#include "xfs_parent.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
+#include "scrub/iscan.h"
+#include "scrub/orphanage.h"
+#include "scrub/nlinks.h"
+#include "scrub/fscounters.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfblob.h"
+#include "scrub/dirtree.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -32,7 +46,7 @@ xchk_btree_cur_fsbno(
xfs_buf_daddr(cur->bc_levels[level].bp));
if (level == cur->bc_nlevels - 1 &&
- (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
return NULLFSBLOCK;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6bbb4e8639dc..d7c4ced47c15 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -15,11 +15,22 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+#include "xfs_quota_defs.h"
+struct xfs_rtgroup;
+struct xfs_scrub;
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
struct xchk_dqiter;
+struct xchk_iscan;
+struct xchk_nlink;
+struct xchk_fscounters;
+struct xfs_rmap_update_params;
+struct xfs_parent_rec;
+enum xchk_dirpath_outcome;
+struct xchk_dirtree;
+struct xchk_dirtree_outcomes;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -27,17 +38,12 @@ struct xchk_dqiter;
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+TRACE_DEFINE_ENUM(XG_TYPE_AG);
+TRACE_DEFINE_ENUM(XG_TYPE_RTG);
+
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
@@ -63,6 +69,15 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -89,7 +104,16 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
{ XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
- { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
+ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
+ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \
+ { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \
+ { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \
+ { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \
+ { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \
+ { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \
+ { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -107,9 +131,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \
+ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \
+ { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \
{ XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -153,6 +189,8 @@ DEFINE_EVENT(xchk_class, name, \
DEFINE_SCRUB_EVENT(xchk_start);
DEFINE_SCRUB_EVENT(xchk_done);
DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
+DEFINE_SCRUB_EVENT(xchk_dirtree_start);
+DEFINE_SCRUB_EVENT(xchk_dirtree_done);
DEFINE_SCRUB_EVENT(xrep_attempt);
DEFINE_SCRUB_EVENT(xrep_done);
@@ -183,6 +221,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
+DECLARE_EVENT_CLASS(xchk_vector_head_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead),
+ TP_ARGS(ip, vhead),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(unsigned short, rest_us)
+ __field(unsigned short, nr_vecs)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = vhead->svh_agno;
+ __entry->inum = vhead->svh_ino;
+ __entry->gen = vhead->svh_gen;
+ __entry->flags = vhead->svh_flags;
+ __entry->rest_us = vhead->svh_rest_us;
+ __entry->nr_vecs = vhead->svh_nr;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->rest_us,
+ __entry->nr_vecs)
+)
+#define DEFINE_SCRUBV_HEAD_EVENT(name) \
+DEFINE_EVENT(xchk_vector_head_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \
+ TP_ARGS(ip, vhead))
+
+DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start);
+
+DECLARE_EVENT_CLASS(xchk_vector_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead,
+ unsigned int vec_nr, struct xfs_scrub_vec *v),
+ TP_ARGS(mp, vhead, vec_nr, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, vec_nr)
+ __field(unsigned int, vec_type)
+ __field(unsigned int, vec_flags)
+ __field(int, vec_ret)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->vec_nr = vec_nr;
+ __entry->vec_type = v->sv_type;
+ __entry->vec_flags = v->sv_flags;
+ __entry->vec_ret = v->sv_ret;
+ ),
+ TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->vec_nr,
+ __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS),
+ __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS),
+ __entry->vec_ret)
+)
+#define DEFINE_SCRUBV_EVENT(name) \
+DEFINE_EVENT(xchk_vector_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \
+ unsigned int vec_nr, struct xfs_scrub_vec *v), \
+ TP_ARGS(mp, vhead, vec_nr, v))
+
+DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_item);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome);
+
TRACE_EVENT(xchk_op_error,
TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int error, void *ret_ip),
@@ -348,6 +461,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen);
#ifdef CONFIG_XFS_QUOTA
DECLARE_EVENT_CLASS(xchk_dqiter_class,
@@ -395,6 +509,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+
+TRACE_EVENT(xchk_qcheck_error,
+ TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id,
+ void *ret_ip),
+ TP_ARGS(sc, dqtype, id, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_dqid_t, id)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->dqtype = dqtype;
+ __entry->id = id;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->id,
+ __entry->ret_ip)
+);
#endif /* CONFIG_XFS_QUOTA */
TRACE_EVENT(xchk_incomplete,
@@ -423,7 +560,7 @@ TRACE_EVENT(xchk_btree_op_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -436,7 +573,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -444,10 +581,10 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -465,7 +602,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, ptr)
__field(xfs_agnumber_t, agno)
@@ -476,10 +613,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
TP_fast_assign(
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
- __entry->ino = sc->ip->i_ino;
+ __entry->ino = cur->bc_ino.ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -487,12 +624,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -508,7 +645,7 @@ TRACE_EVENT(xchk_btree_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -519,17 +656,17 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -546,7 +683,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -559,19 +696,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -586,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
__field(int, level)
@@ -598,17 +735,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
),
- TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->bno,
__entry->level,
@@ -647,12 +784,12 @@ TRACE_EVENT(xchk_xref_error,
);
TRACE_EVENT(xchk_iallocbt_check_cluster,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino, xfs_daddr_t map_daddr,
- unsigned short map_len, unsigned int chunk_ino,
- unsigned int nr_inodes, uint16_t cluster_mask,
- uint16_t holemask, unsigned int cluster_ino),
- TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agino_t startino,
+ xfs_daddr_t map_daddr, unsigned short map_len,
+ unsigned int chunk_ino, unsigned int nr_inodes,
+ uint16_t cluster_mask, uint16_t holemask,
+ unsigned int cluster_ino),
+ TP_ARGS(pag, startino, map_daddr, map_len, chunk_ino, nr_inodes,
cluster_mask, holemask, cluster_ino),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -667,8 +804,8 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__field(uint16_t, holemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startino = startino;
__entry->map_daddr = map_daddr;
__entry->map_len = map_len;
@@ -797,7 +934,8 @@ DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
TRACE_EVENT(xchk_refcount_incorrect,
- TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
+ TP_PROTO(const struct xfs_perag *pag,
+ const struct xfs_refcount_irec *irec,
xfs_nlink_t seen),
TP_ARGS(pag, irec, seen),
TP_STRUCT__entry(
@@ -810,8 +948,8 @@ TRACE_EVENT(xchk_refcount_incorrect,
__field(xfs_nlink_t, seen)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -834,18 +972,16 @@ TRACE_EVENT(xfile_create,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned long, ino)
- __array(char, pathname, 256)
+ __array(char, pathname, MAXNAMELEN)
),
TP_fast_assign(
- char pathname[257];
char *path;
__entry->ino = file_inode(xf->file)->i_ino;
- memset(pathname, 0, sizeof(pathname));
- path = file_path(xf->file, pathname, sizeof(pathname) - 1);
+ path = file_path(xf->file, __entry->pathname, MAXNAMELEN);
if (IS_ERR(path))
- path = "(unknown)";
- strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ strncpy(__entry->pathname, "(unknown)",
+ sizeof(__entry->pathname));
),
TP_printk("xfino 0x%lx path '%s'",
__entry->ino,
@@ -861,18 +997,11 @@ TRACE_EVENT(xfile_destroy,
__field(loff_t, size)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes = inode->i_blocks << SECTOR_SHIFT;
+ __entry->size = i_size_read(inode);
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
@@ -891,19 +1020,12 @@ DECLARE_EVENT_CLASS(xfile_class,
__field(unsigned long long, bytecount)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes_used = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes_used = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT;
__entry->pos = pos;
+ __entry->size = i_size_read(inode);
__entry->bytecount = bytecount;
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
@@ -917,11 +1039,12 @@ DECLARE_EVENT_CLASS(xfile_class,
DEFINE_EVENT(xfile_class, name, \
TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
TP_ARGS(xf, pos, bytecount))
-DEFINE_XFILE_EVENT(xfile_pread);
-DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_load);
+DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
-DEFINE_XFILE_EVENT(xfile_get_page);
-DEFINE_XFILE_EVENT(xfile_put_page);
+DEFINE_XFILE_EVENT(xfile_get_folio);
+DEFINE_XFILE_EVENT(xfile_put_folio);
+DEFINE_XFILE_EVENT(xfile_discard);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -968,7 +1091,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo)
);
-TRACE_EVENT(xfarray_pagesort,
+TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi),
TP_STRUCT__entry(
@@ -1039,6 +1162,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes)
);
+TRACE_EVENT(xfarray_sort_scan,
+ TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
+ TP_ARGS(si, idx),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(unsigned long long, idx)
+ __field(unsigned long long, folio_pos)
+ __field(unsigned long, folio_bytes)
+ __field(unsigned long long, first_idx)
+ __field(unsigned long long, last_idx)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->idx = idx;
+ if (si->folio) {
+ __entry->folio_pos = folio_pos(si->folio);
+ __entry->folio_bytes = folio_size(si->folio);
+ __entry->first_idx = si->first_folio_idx;
+ __entry->last_idx = si->last_folio_idx;
+ } else {
+ __entry->folio_pos = 0;
+ __entry->folio_bytes = 0;
+ __entry->first_idx = 0;
+ __entry->last_idx = 0;
+ }
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->idx,
+ __entry->folio_pos,
+ __entry->folio_bytes,
+ __entry->first_idx,
+ __entry->last_idx)
+);
+
TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error),
@@ -1119,76 +1283,763 @@ TRACE_EVENT(xchk_rtsum_record_free,
);
#endif /* CONFIG_XFS_RT */
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+ TP_ARGS(iscan, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, startino)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->startino = iscan->scan_start_ino;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->startino,
+ __entry->cursor,
+ __entry->visited,
+ __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+ TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+ TP_PROTO(struct xchk_iscan *iscan, int error),
+ TP_ARGS(iscan, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+ TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+ unsigned int nr, unsigned int avail),
+ TP_ARGS(mp, iscan, nr, avail),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, nr)
+ __field(unsigned int, avail)
+ __field(unsigned int, unavail)
+ __field(xfs_ino_t, batch_ino)
+ __field(unsigned long long, skipmask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->nr = nr;
+ __entry->avail = avail;
+ __entry->unavail = hweight64(iscan->__skipped_inomask);
+ __entry->batch_ino = iscan->__batch_ino;
+ __entry->skipmask = iscan->__skipped_inomask;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->batch_ino,
+ __entry->skipmask,
+ __entry->nr,
+ __entry->avail,
+ __entry->unavail)
+);
+
+DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, retry_delay)
+ __field(unsigned long, remaining)
+ __field(unsigned int, iget_timeout)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->retry_delay = iscan->iget_retry_delay;
+ __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+ __entry->iget_timeout = iscan->iget_timeout;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->remaining,
+ __entry->iget_timeout,
+ __entry->retry_delay)
+)
+#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait);
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait);
+
+TRACE_EVENT(xchk_nlinks_collect_dirent,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ xfs_ino_t ino, const struct xfs_name *name),
+ TP_ARGS(mp, dp, ino, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_pptr,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(mp, dp, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = be64_to_cpu(pptr->p_ino);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_metafile,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
+ TP_ARGS(mp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+TRACE_EVENT(xchk_nlinks_live_update,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+ int action, xfs_ino_t ino, int delta,
+ const char *name, unsigned int namelen),
+ TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(int, action)
+ __field(xfs_ino_t, ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp ? dp->i_ino : NULLFSINO;
+ __entry->action = action;
+ __entry->ino = ino;
+ __entry->delta = delta;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_check_zero,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ino, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+TRACE_EVENT(xchk_nlinks_update_incore,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live, int parents_delta,
+ int backrefs_delta, int children_delta),
+ TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ __field(int, parents_delta)
+ __field(int, backrefs_delta)
+ __field(int, children_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ __entry->parents_delta = parents_delta;
+ __entry->backrefs_delta = backrefs_delta;
+ __entry->children_delta = children_delta;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents_delta,
+ __entry->parents,
+ __entry->backrefs_delta,
+ __entry->backrefs,
+ __entry->children_delta,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xchk_nlinks_diff_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ip, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ __field(xfs_nlink_t, nlink)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->nlink,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \
+DEFINE_EVENT(xchk_nlinks_diff_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \
+ const struct xchk_nlink *live), \
+ TP_ARGS(mp, ip, live))
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+
+DECLARE_EVENT_CLASS(xchk_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ xfs_ino_t far_ino),
+ TP_ARGS(ip, name, far_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, far_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name, name->len);
+ __entry->far_ino = far_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name),
+ __entry->far_ino)
+)
+#define DEFINE_XCHK_PPTR_EVENT(name) \
+DEFINE_EVENT(xchk_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ xfs_ino_t far_ino), \
+ TP_ARGS(ip, name, far_ino))
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRTREE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, name, pptr))
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path);
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards);
+
+DECLARE_EVENT_CLASS(xchk_dirpath_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+ unsigned int path_nr, unsigned int step_nr,
+ const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(unsigned int, child_gen)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->child_gen = VFS_I(ip)->i_generation;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->child_gen,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_XCHK_DIRPATH_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+ unsigned int path_nr, unsigned int step_nr, \
+ const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(sc, ip, path_nr, step_nr, name, pptr))
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree);
+
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED);
+
+#define XCHK_DIRPATH_OUTCOME_STRINGS \
+ { XCHK_DIRPATH_SCANNING, "scanning" }, \
+ { XCHK_DIRPATH_DELETE, "delete" }, \
+ { XCHK_DIRPATH_CORRUPT, "corrupt" }, \
+ { XCHK_DIRPATH_LOOP, "loop" }, \
+ { XCHK_DIRPATH_STALE, "stale" }, \
+ { XCHK_DIRPATH_OK, "ok" }, \
+ { XREP_DIRPATH_DELETING, "deleting" }, \
+ { XREP_DIRPATH_DELETED, "deleted" }, \
+ { XREP_DIRPATH_ADOPTING, "adopting" }, \
+ { XREP_DIRPATH_ADOPTED, "adopted" }
+
+DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class,
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr,
+ unsigned int nr_steps, \
+ unsigned int outcome),
+ TP_ARGS(sc, path_nr, nr_steps, outcome),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, path_nr)
+ __field(unsigned int, nr_steps)
+ __field(unsigned int, outcome)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->nr_steps = nr_steps;
+ __entry->outcome = outcome;
+ ),
+ TP_printk("dev %d:%d path %llu steps %u outcome %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->nr_steps,
+ __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS))
+);
+#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_outcome_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \
+ unsigned int nr_steps, \
+ unsigned int outcome), \
+ TP_ARGS(sc, path_nr, nr_steps, outcome))
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome);
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class,
+ TP_PROTO(const struct xchk_dirtree *dl,
+ const struct xchk_dirtree_outcomes *oc),
+ TP_ARGS(dl, oc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, rootino)
+ __field(unsigned int, nr_paths)
+ __field(unsigned int, bad)
+ __field(unsigned int, suspect)
+ __field(unsigned int, good)
+ __field(bool, needs_adoption)
+ ),
+ TP_fast_assign(
+ __entry->dev = dl->sc->mp->m_super->s_dev;
+ __entry->ino = dl->sc->ip->i_ino;
+ __entry->rootino = dl->root_ino;
+ __entry->nr_paths = dl->nr_paths;
+ __entry->bad = oc->bad;
+ __entry->suspect = oc->suspect;
+ __entry->good = oc->good;
+ __entry->needs_adoption = oc->needs_adoption ? 1 : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->rootino,
+ __entry->nr_paths,
+ __entry->bad,
+ __entry->suspect,
+ __entry->good,
+ __entry->needs_adoption)
+);
+#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \
+ TP_PROTO(const struct xchk_dirtree *dl, \
+ const struct xchk_dirtree_outcomes *oc), \
+ TP_ARGS(dl, oc))
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate);
+
+TRACE_EVENT(xchk_dirpath_changed,
+ TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr,
+ unsigned int step_nr, const struct xfs_inode *dp,
+ const struct xfs_inode *ip, const struct xfs_name *xname),
+ TP_ARGS(sc, path_nr, step_nr, dp, ip, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, path_nr)
+ __field(unsigned int, step_nr)
+ __field(xfs_ino_t, child_ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->path_nr = path_nr;
+ __entry->step_nr = step_nr;
+ __entry->child_ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->path_nr,
+ __entry->step_nr,
+ __entry->child_ino,
+ __entry->parent_ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_dirtree_live_update,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp,
+ int action, const struct xfs_inode *ip, int delta,
+ const struct xfs_name *xname),
+ TP_ARGS(sc, dp, action, ip, delta, xname),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, parent_ino)
+ __field(int, action)
+ __field(xfs_ino_t, child_ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, xname->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->parent_ino = dp->i_ino;
+ __entry->action = action;
+ __entry->child_ino = ip->i_ino;
+ __entry->delta = delta;
+ __entry->namelen = xname->len;
+ memcpy(__get_str(name), xname->name, xname->len);
+ ),
+ TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->parent_ino,
+ __entry->child_ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
+DECLARE_EVENT_CLASS(xchk_metapath_class,
+ TP_PROTO(struct xfs_scrub *sc, const char *path,
+ struct xfs_inode *dp, xfs_ino_t ino),
+ TP_ARGS(sc, path, dp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, scrub_ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(xfs_ino_t, ino)
+ __string(name, path)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO;
+ __entry->parent_ino = dp ? dp->i_ino : NULLFSINO;
+ __entry->ino = ino;
+ __assign_str(name);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->scrub_ino,
+ __entry->parent_ino,
+ __get_str(name),
+ __entry->ino)
+);
+#define DEFINE_XCHK_METAPATH_EVENT(name) \
+DEFINE_EVENT(xchk_metapath_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, const char *path, \
+ struct xfs_inode *dp, xfs_ino_t ino), \
+ TP_ARGS(sc, path, dp, ino))
+DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(pag, agbno, len),
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
+ xfs_extlen_t len),
+ TP_ARGS(xg, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len)
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(pag, agbno, len))
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(xg, agbno, len))
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
- bool crosslinked),
- TP_ARGS(pag, agbno, len, crosslinked),
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
+ xfs_extlen_t len, bool crosslinked),
+ TP_ARGS(xg, agbno, len, crosslinked),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
__field(bool, crosslinked)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->crosslinked = crosslinked;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len,
__entry->crosslinked ? 1 : 0)
);
#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
DEFINE_EVENT(xrep_reap_find_class, name, \
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \
- bool crosslinked), \
- TP_ARGS(pag, agbno, len, crosslinked))
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
+ xfs_extlen_t len, bool crosslinked), \
+ TP_ARGS(xg, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
-DECLARE_EVENT_CLASS(xrep_rmap_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len,
- uint64_t owner, uint64_t offset, unsigned int flags),
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+TRACE_EVENT(xrep_ibt_walk_rmap,
+ TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1199,13 +2050,13 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->len = len;
- __entry->owner = owner;
- __entry->offset = offset;
- __entry->flags = flags;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
),
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1216,20 +2067,11 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__entry->offset,
__entry->flags)
);
-#define DEFINE_REPAIR_RMAP_EVENT(name) \
-DEFINE_EVENT(xrep_rmap_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len, \
- uint64_t owner, uint64_t offset, unsigned int flags), \
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
TRACE_EVENT(xrep_abt_found,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(const struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *rec),
- TP_ARGS(mp, agno, rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1237,8 +2079,8 @@ TRACE_EVENT(xrep_abt_found,
__field(xfs_extlen_t, blockcount)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startblock = rec->ar_startblock;
__entry->blockcount = rec->ar_blockcount;
),
@@ -1250,9 +2092,9 @@ TRACE_EVENT(xrep_abt_found,
)
TRACE_EVENT(xrep_ibt_found,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(const struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *rec),
- TP_ARGS(mp, agno, rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1263,8 +2105,8 @@ TRACE_EVENT(xrep_ibt_found,
__field(uint64_t, freemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startino = rec->ir_startino;
__entry->holemask = rec->ir_holemask;
__entry->count = rec->ir_count;
@@ -1282,28 +2124,33 @@ TRACE_EVENT(xrep_ibt_found,
)
TRACE_EVENT(xrep_refc_found,
- TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
- TP_ARGS(pag, rec),
+ TP_PROTO(const struct xfs_group *xg,
+ const struct xfs_refcount_irec *rec),
+ TP_ARGS(xg, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(enum xfs_refc_domain, domain)
+ __field(enum xfs_group_type, type)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->agno = xg->xg_gno;
+ __entry->type = xg->xg_type;
__entry->domain = rec->rc_domain;
__entry->startblock = rec->rc_startblock;
__entry->blockcount = rec->rc_blockcount;
__entry->refcount = rec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
@@ -1341,10 +2188,41 @@ TRACE_EVENT(xrep_bmap_found,
__entry->state)
);
+TRACE_EVENT(xrep_rmap_found,
+ TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec),
+ TP_ARGS(pag, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
TRACE_EVENT(xrep_findroot_block,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
- TP_ARGS(mp, agno, agbno, magic, level),
+ TP_ARGS(pag, agbno, magic, level),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1353,8 +2231,8 @@ TRACE_EVENT(xrep_findroot_block,
__field(uint16_t, level)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->magic = magic;
__entry->level = level;
@@ -1367,10 +2245,10 @@ TRACE_EVENT(xrep_findroot_block,
__entry->level)
)
TRACE_EVENT(xrep_calc_ag_resblks,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agino_t icount,
+ xfs_agblock_t aglen, xfs_agblock_t freelen,
xfs_agblock_t usedlen),
- TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+ TP_ARGS(pag, icount, aglen, freelen, usedlen),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1380,8 +2258,8 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__field(xfs_agblock_t, usedlen)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->icount = icount;
__entry->aglen = aglen;
__entry->freelen = freelen;
@@ -1396,10 +2274,10 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__entry->usedlen)
)
TRACE_EVENT(xrep_calc_ag_resblks_btsize,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
- xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
- TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t bnobt_sz,
+ xfs_agblock_t inobt_sz, xfs_agblock_t rmapbt_sz,
+ xfs_agblock_t refcbt_sz),
+ TP_ARGS(pag, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1409,8 +2287,8 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__field(xfs_agblock_t, refcbt_sz)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bnobt_sz = bnobt_sz;
__entry->inobt_sz = inobt_sz;
__entry->rmapbt_sz = rmapbt_sz;
@@ -1424,24 +2302,61 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->rmapbt_sz,
__entry->refcbt_sz)
)
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize,
+ TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz),
+ TP_ARGS(mp, rgno, usedlen, rmapbt_sz),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, usedlen)
+ __field(xfs_rgblock_t, rmapbt_sz)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgno = rgno;
+ __entry->usedlen = usedlen;
+ __entry->rmapbt_sz = rmapbt_sz;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->usedlen,
+ __entry->rmapbt_sz)
+);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xrep_reset_counters,
- TP_PROTO(struct xfs_mount *mp),
- TP_ARGS(mp),
+ TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
+ TP_ARGS(mp, fsc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(uint64_t, icount)
+ __field(uint64_t, ifree)
+ __field(uint64_t, fdblocks)
+ __field(uint64_t, frextents)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __entry->icount = fsc->icount;
+ __entry->ifree = fsc->ifree;
+ __entry->fdblocks = fsc->fdblocks;
+ __entry->frextents = fsc->frextents;
),
- TP_printk("dev %d:%d",
- MAJOR(__entry->dev), MINOR(__entry->dev))
+ TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount,
+ __entry->ifree,
+ __entry->fdblocks,
+ __entry->frextents)
)
DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len,
- int64_t owner),
- TP_ARGS(mp, agno, agbno, len, owner),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t len, int64_t owner),
+ TP_ARGS(pag, agbno, len, owner),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1450,8 +2365,8 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
__field(int64_t, owner)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = owner;
@@ -1465,10 +2380,9 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
);
#define DEFINE_NEWBT_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_newbt_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len, \
- int64_t owner), \
- TP_ARGS(mp, agno, agbno, len, owner))
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ xfs_extlen_t len, int64_t owner), \
+ TP_ARGS(pag, agbno, len, owner))
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
@@ -1645,6 +2559,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps,
__entry->attr_extents)
);
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype),
+ TP_ARGS(sc, dp, ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype, unsigned int found_ftype),
+ TP_ARGS(sc, dp, ftype, found_ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ __field(unsigned int, found_ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ __entry->found_ftype = found_ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
TRACE_EVENT(xrep_cow_mark_file_range,
TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
xfs_fileoff_t startoff, xfs_filblks_t blockcount),
@@ -1707,7 +2670,7 @@ TRACE_EVENT(xrep_cow_replace_mapping,
);
TRACE_EVENT(xrep_cow_free_staging,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
xfs_extlen_t blockcount),
TP_ARGS(pag, agbno, blockcount),
TP_STRUCT__entry(
@@ -1717,8 +2680,8 @@ TRACE_EVENT(xrep_cow_free_staging,
__field(xfs_extlen_t, blockcount)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->blockcount = blockcount;
),
@@ -1756,8 +2719,1122 @@ DEFINE_EVENT(xrep_dquot_class, name, \
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
#endif /* CONFIG_XFS_QUOTA */
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
+
+TRACE_EVENT(xrep_rmap_live_update,
+ TP_PROTO(const struct xfs_group *xg, unsigned int op,
+ const struct xfs_rmap_update_params *p),
+ TP_ARGS(xg, op, p),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_group_type, type)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, op)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
+ __entry->op = op;
+ __entry->agbno = p->startblock;
+ __entry->len = p->blockcount;
+ xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+ &__entry->offset, &__entry->flags);
+ if (p->unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->agno,
+ __entry->op,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
+TRACE_EVENT(xrep_tempfile_create,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(xfs_ino_t, temp_inum)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = sc->sm->sm_agno;
+ __entry->inum = sc->sm->sm_ino;
+ __entry->gen = sc->sm->sm_gen;
+ __entry->flags = sc->sm->sm_flags;
+ __entry->temp_inum = sc->tempip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->temp_inum)
+);
+
+DECLARE_EVENT_CLASS(xrep_tempfile_class,
+ TP_PROTO(struct xfs_scrub *sc, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->tempip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+#define DEFINE_XREP_TEMPFILE_EVENT(name) \
+DEFINE_EVENT(xrep_tempfile_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int whichfork, \
+ struct xfs_bmbt_irec *irec), \
+ TP_ARGS(sc, whichfork, irec))
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc);
+DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin);
+
+TRACE_EVENT(xreap_ifork_extent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork,
+ const struct xfs_bmbt_irec *irec),
+ TP_ARGS(sc, ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, fileoff)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->fileoff = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->agno,
+ __entry->agbno,
+ __entry->fileoff,
+ __entry->len,
+ __entry->state)
+);
+
+TRACE_EVENT(xreap_bmapi_binval_scan,
+ TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec,
+ xfs_extlen_t scan_blocks),
+ TP_ARGS(sc, irec, scan_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_filblks_t, len)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, scan_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->len = irec->br_blockcount;
+ __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+ __entry->scan_blocks = scan_blocks;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->scan_blocks)
+);
+
+TRACE_EVENT(xrep_xattr_recover_leafblock,
+ TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic),
+ TP_ARGS(ip, dabno, magic),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint16_t, magic)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->dabno,
+ __entry->magic)
+);
+
+DECLARE_EVENT_CLASS(xrep_xattr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name,
+ unsigned int namelen, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ __field(unsigned int, valuelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ __entry->valuelen = valuelen;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->valuelen)
+);
+#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \
+ unsigned int namelen, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, valuelen))
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr);
+DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr);
+
+DECLARE_EVENT_CLASS(xrep_pptr_salvage_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name,
+ unsigned int namelen, const void *value, unsigned int valuelen),
+ TP_ARGS(ip, flags, name, namelen, value, valuelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ const struct xfs_parent_rec *rec = value;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(rec->p_ino);
+ __entry->parent_gen = be32_to_cpu(rec->p_gen);
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \
+ unsigned int namelen, const void *value, unsigned int valuelen), \
+ TP_ARGS(ip, flags, name, namelen, value, valuelen))
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
+
+TRACE_EVENT(xrep_xattr_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
+ TP_ARGS(ip, arg_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, src_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->src_ino = arg_ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx src 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->src_ino)
+)
+#define DEFINE_XREP_XATTR_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \
+ TP_ARGS(ip, arg_ip))
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset);
+
+DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd);
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove);
+
+TRACE_EVENT(xrep_dir_recover_dirblock,
+ TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic,
+ uint32_t magic_guess),
+ TP_ARGS(dp, dabno, magic, magic_guess),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_dablk_t, dabno)
+ __field(uint32_t, magic)
+ __field(uint32_t, magic_guess)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->dabno = dabno;
+ __entry->magic = magic;
+ __entry->magic_guess = magic_guess;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->dabno,
+ __entry->magic,
+ __entry->magic_guess)
+);
+
+DECLARE_EVENT_CLASS(xrep_dir_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino),
+ TP_ARGS(dp, parent_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, parent_ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->parent_ino = parent_ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->parent_ino)
+)
+#define DEFINE_XREP_DIR_EVENT(name) \
+DEFINE_EVENT(xrep_dir_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \
+ TP_ARGS(dp, parent_ino))
+DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree);
+DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork);
+DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot);
+
+DECLARE_EVENT_CLASS(xrep_dirent_class,
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name,
+ xfs_ino_t ino),
+ TP_ARGS(dp, name, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ __entry->ino = ino;
+ __entry->ftype = name->type;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->ino)
+)
+#define DEFINE_XREP_DIRENT_EVENT(name) \
+DEFINE_EVENT(xrep_dirent_class, name, \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \
+ xfs_ino_t ino), \
+ TP_ARGS(dp, name, ino))
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename);
+
+DECLARE_EVENT_CLASS(xrep_adoption_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved),
+ TP_ARGS(dp, ip, moved),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, child_ino)
+ __field(bool, moved)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->child_ino = ip->i_ino;
+ __entry->moved = moved;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->child_ino,
+ __entry->moved)
+);
+#define DEFINE_XREP_ADOPTION_EVENT(name) \
+DEFINE_EVENT(xrep_adoption_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \
+ TP_ARGS(dp, ip, moved))
+DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll);
+
+DECLARE_EVENT_CLASS(xrep_parent_salvage_class,
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino),
+ TP_ARGS(dp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir_ino)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = dp->i_mount->m_super->s_dev;
+ __entry->dir_ino = dp->i_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir_ino,
+ __entry->ino)
+)
+#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_parent_salvage_class, name, \
+ TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \
+ TP_ARGS(dp, ino))
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache);
+
+DECLARE_EVENT_CLASS(xrep_pptr_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+ const struct xfs_parent_rec *pptr),
+ TP_ARGS(ip, name, pptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = be64_to_cpu(pptr->p_ino);
+ __entry->parent_gen = be32_to_cpu(pptr->p_gen);
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+ const struct xfs_parent_rec *pptr), \
+ TP_ARGS(ip, name, pptr))
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove);
+
+DECLARE_EVENT_CLASS(xrep_pptr_scan_class,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+ const struct xfs_name *name),
+ TP_ARGS(ip, dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, parent_gen)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->parent_gen = VFS_IC(dp)->i_generation;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __entry->parent_gen,
+ __entry->namelen,
+ __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_scan_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+ const struct xfs_name *name), \
+ TP_ARGS(ip, dp, name))
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd);
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove);
+
+TRACE_EVENT(xrep_nlinks_set_record,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *obs),
+ TP_ARGS(mp, ino, obs),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = obs->parents;
+ __entry->backrefs = obs->backrefs;
+ __entry->children = obs->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xrep_dentry_class,
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry),
+ TP_ARGS(mp, dentry),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ __field(unsigned long, ino)
+ __field(bool, positive)
+ __field(unsigned long, parent_ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, dentry->d_name.len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = dentry->d_flags;
+ __entry->positive = d_is_positive(dentry);
+ if (dentry->d_parent && d_inode(dentry->d_parent))
+ __entry->parent_ino = d_inode(dentry->d_parent)->i_ino;
+ else
+ __entry->parent_ino = -1UL;
+ __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0;
+ __entry->namelen = dentry->d_name.len;
+ memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len);
+ ),
+ TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->positive,
+ __entry->parent_ino,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+#define DEFINE_REPAIR_DENTRY_EVENT(name) \
+DEFINE_EVENT(xrep_dentry_class, name, \
+ TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \
+ TP_ARGS(mp, dentry))
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child);
+
+TRACE_EVENT(xrep_symlink_salvage_target,
+ TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen),
+ TP_ARGS(ip, target, targetlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, targetlen)
+ __dynamic_array(char, target, targetlen + 1)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->targetlen = targetlen;
+ memcpy(__get_str(target), target, targetlen);
+ __get_str(target)[targetlen] = 0;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx target '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->targetlen,
+ __get_str(target))
+);
+
+DECLARE_EVENT_CLASS(xrep_symlink_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ip 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+#define DEFINE_XREP_SYMLINK_EVENT(name) \
+DEFINE_EVENT(xrep_symlink_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild);
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork);
+
+TRACE_EVENT(xrep_iunlink_visit,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t bucket_agino, struct xfs_inode *ip),
+ TP_ARGS(pag, bucket, bucket_agino, ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, bucket_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agino = XFS_INO_TO_AGINO(pag_mount(pag), ip->i_ino);
+ __entry->bucket = bucket;
+ __entry->bucket_agino = bucket_agino;
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->bucket_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_prev_agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ __field(unsigned int, nlink)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->old_prev_agino = ip->i_prev_unlinked;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->old_prev_agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_reload_ondisk,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, nlink)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->nlink,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino),
+ TP_ARGS(pag, bucket, prev_agino, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->bucket = bucket;
+ __entry->prev_agino = prev_agino;
+ __entry->next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \
+DEFINE_EVENT(xrep_iunlink_resolve_class, name, \
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, \
+ xfs_agino_t prev_agino, xfs_agino_t next_agino), \
+ TP_ARGS(pag, bucket, prev_agino, next_agino))
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist);
+DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok);
+
+TRACE_EVENT(xrep_iunlink_relink_next,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino),
+ TP_ARGS(ip, next_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ __field(xfs_agino_t, new_next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->next_agino = ip->i_next_unlinked;
+ __entry->new_next_agino = next_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->next_agino,
+ __entry->new_next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_relink_prev,
+ TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino),
+ TP_ARGS(ip, prev_agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, new_prev_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->new_prev_agino = prev_agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->new_prev_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_add_to_bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t agino, xfs_agino_t curr_head),
+ TP_ARGS(pag, bucket, agino, curr_head),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->bucket = bucket;
+ __entry->agino = agino;
+ __entry->next_agino = curr_head;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xrep_iunlink_commit_bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
+ xfs_agino_t old_agino, xfs_agino_t agino),
+ TP_ARGS(pag, bucket, old_agino, agino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_agino)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->bucket = bucket;
+ __entry->old_agino = old_agino;
+ __entry->agino = agino;
+ ),
+ TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_agino,
+ __entry->agino)
+);
+
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path);
+DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption);
+DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate);
+
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link);
+
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xrep_rtbitmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end),
+ TP_ARGS(mp, start, end),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rtxnum_t, start)
+ __field(xfs_rtxnum_t, end)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->start = start;
+ __entry->end = end;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->start,
+ __entry->end)
+);
+#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \
+DEFINE_EVENT(xrep_rtbitmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \
+ xfs_rtxnum_t end), \
+ TP_ARGS(mp, start, end))
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free);
+DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk);
+
+TRACE_EVENT(xrep_rtbitmap_or,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+ xfs_rtword_t mask, xfs_rtword_t word),
+ TP_ARGS(mp, wordoff, mask, word),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, mask)
+ __field(unsigned int, word)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->wordoff = wordoff;
+ __entry->mask = mask;
+ __entry->word = word;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->wordoff,
+ __entry->mask,
+ __entry->word)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff,
+ xfs_rtxnum_t rtx, xfs_rtxnum_t len),
+ TP_ARGS(rtg, rbmoff, rtx, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_fileoff_t, rbmoff)
+ __field(xfs_rtxnum_t, rtx)
+ __field(xfs_rtxnum_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->rbmoff = rbmoff;
+ __entry->rtx = rtx;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rgno,
+ __entry->rbmoff,
+ __entry->rtx,
+ __entry->len)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load_words,
+ TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff,
+ unsigned long long wordoff, unsigned int wordcnt),
+ TP_ARGS(mp, rbmoff, wordoff, wordcnt),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_fileoff_t, rbmoff)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, wordcnt)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->rbmoff = rbmoff;
+ __entry->wordoff = wordoff;
+ __entry->wordcnt = wordcnt;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rbmoff,
+ __entry->wordoff,
+ __entry->wordcnt)
+);
+
+TRACE_EVENT(xrep_rtbitmap_load_word,
+ TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff,
+ unsigned int bit, xfs_rtword_t ondisk_word,
+ xfs_rtword_t xfile_word, xfs_rtword_t word_mask),
+ TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(unsigned long long, wordoff)
+ __field(unsigned int, bit)
+ __field(xfs_rtword_t, ondisk_word)
+ __field(xfs_rtword_t, xfile_word)
+ __field(xfs_rtword_t, word_mask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->wordoff = wordoff;
+ __entry->bit = bit;
+ __entry->ondisk_word = ondisk_word;
+ __entry->xfile_word = xfile_word;
+ __entry->word_mask = word_mask;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->wordoff,
+ __entry->bit,
+ __entry->ondisk_word,
+ __entry->ondisk_word & __entry->word_mask,
+ __entry->xfile_word,
+ __entry->xfile_word & ~__entry->word_mask,
+ (__entry->xfile_word & ~__entry->word_mask) |
+ (__entry->ondisk_word & __entry->word_mask),
+ __entry->word_mask)
+);
+
+TRACE_EVENT(xrep_rtrmap_found,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec),
+ TP_ARGS(mp, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->rgbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->rgbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#endif /* CONFIG_XFS_RT */
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index f0f532c10a5a..cdd13ed9c569 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -7,16 +7,16 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
/*
* Large Arrays of Fixed-Size Records
* ==================================
*
- * This memory array uses an xfile (which itself is a memfd "file") to store
+ * This memory array uses an xfile (which itself is a shmem file) to store
* large numbers of fixed-size records in memory that can be paged out. This
* puts less stress on the memory reclaim algorithms during an online repair
* because we don't have to pin so much memory. However, array access is less
@@ -136,7 +136,7 @@ xfarray_load(
if (idx >= array->nr)
return -ENODATA;
- return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ return xfile_load(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
}
@@ -152,7 +152,7 @@ xfarray_is_unset(
if (array->unset_slots == 0)
return false;
- error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ error = xfile_load(array->xfile, temp, array->obj_size, pos);
if (!error && xfarray_element_is_null(array, temp))
return true;
@@ -184,7 +184,7 @@ xfarray_unset(
return 0;
memset(temp, 0, array->obj_size);
- error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ error = xfile_store(array->xfile, temp, array->obj_size, pos);
if (error)
return error;
@@ -209,7 +209,7 @@ xfarray_store(
ASSERT(!xfarray_element_is_null(array, ptr));
- ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ ret = xfile_store(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
if (ret)
return ret;
@@ -245,12 +245,12 @@ xfarray_store_anywhere(
for (pos = 0;
pos < endpos && array->unset_slots > 0;
pos += array->obj_size) {
- error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ error = xfile_load(array->xfile, temp, array->obj_size,
pos);
if (error || !xfarray_element_is_null(array, temp))
continue;
- error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ error = xfile_store(array->xfile, ptr, array->obj_size,
pos);
if (error)
return error;
@@ -486,6 +486,9 @@ xfarray_sortinfo_alloc(
xfarray_sortinfo_lo(si)[0] = 0;
xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+ si->relax = INIT_XCHK_RELAX;
+ if (flags & XFARRAY_SORT_KILLABLE)
+ si->relax.interruptible = false;
trace_xfarray_sort(si, nr_bytes);
*infop = si;
@@ -503,10 +506,7 @@ xfarray_sort_terminated(
* few seconds so that we don't run afoul of the soft lockup watchdog
* or RCU stall detector.
*/
- cond_resched();
-
- if ((si->flags & XFARRAY_SORT_KILLABLE) &&
- fatal_signal_pending(current)) {
+ if (xchk_maybe_relax(&si->relax)) {
if (*error == 0)
*error = -EINTR;
return true;
@@ -552,7 +552,7 @@ xfarray_isort(
trace_xfarray_isort(si, lo, hi);
xfarray_sort_bump_loads(si);
- error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ error = xfile_load(si->array->xfile, scratch, len, lo_pos);
if (error)
return error;
@@ -560,88 +560,45 @@ xfarray_isort(
sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
-}
-
-/* Grab a page for sorting records. */
-static inline int
-xfarray_sort_get_page(
- struct xfarray_sortinfo *si,
- loff_t pos,
- uint64_t len)
-{
- int error;
-
- error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
- if (error)
- return error;
-
- /*
- * xfile pages must never be mapped into userspace, so we skip the
- * dcache flush when mapping the page.
- */
- si->page_kaddr = kmap_local_page(si->xfpage.page);
- return 0;
-}
-
-/* Release a page we grabbed for sorting records. */
-static inline int
-xfarray_sort_put_page(
- struct xfarray_sortinfo *si)
-{
- if (!si->page_kaddr)
- return 0;
-
- kunmap_local(si->page_kaddr);
- si->page_kaddr = NULL;
-
- return xfile_put_page(si->array->xfile, &si->xfpage);
+ return xfile_store(si->array->xfile, scratch, len, lo_pos);
}
-/* Decide if these records are eligible for in-page sorting. */
-static inline bool
-xfarray_want_pagesort(
- struct xfarray_sortinfo *si,
- xfarray_idx_t lo,
- xfarray_idx_t hi)
-{
- pgoff_t lo_page;
- pgoff_t hi_page;
- loff_t end_pos;
-
- /* We can only map one page at a time. */
- lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
- end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
- hi_page = end_pos >> PAGE_SHIFT;
-
- return lo_page == hi_page;
-}
-
-/* Sort a bunch of records that all live in the same memory page. */
+/*
+ * Sort the records from lo to hi (inclusive) if they are all backed by the
+ * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
+ * errno.
+ */
STATIC int
-xfarray_pagesort(
+xfarray_foliosort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
+ struct folio *folio;
void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo);
- uint64_t len = xfarray_pos(si->array, hi - lo);
- int error = 0;
+ uint64_t len = xfarray_pos(si->array, hi - lo + 1);
- trace_xfarray_pagesort(si, lo, hi);
+ /* No single folio could back this many records. */
+ if (len > XFILE_MAX_FOLIO_SIZE)
+ return 0;
xfarray_sort_bump_loads(si);
- error = xfarray_sort_get_page(si, lo_pos, len);
- if (error)
- return error;
+ folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ if (!folio)
+ return 0;
+
+ trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si);
- startp = si->page_kaddr + offset_in_page(lo_pos);
+ startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfarray_sort_put_page(si);
+ xfile_put_folio(si->array->xfile, folio);
+ return 1;
}
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@@ -829,63 +786,80 @@ xfarray_qsort_push(
return 0;
}
+static inline void
+xfarray_sort_scan_done(
+ struct xfarray_sortinfo *si)
+{
+ if (si->folio)
+ xfile_put_folio(si->array->xfile, si->folio);
+ si->folio = NULL;
+}
+
/*
- * Load an element from the array into the first scratchpad and cache the page,
- * if possible.
+ * Cache the folio backing the start of the given array element. If the array
+ * element is contained entirely within the folio, return a pointer to the
+ * cached folio. Otherwise, load the element into the scratchpad and return a
+ * pointer to the scratchpad.
*/
static inline int
-xfarray_sort_load_cached(
+xfarray_sort_scan(
struct xfarray_sortinfo *si,
xfarray_idx_t idx,
- void *ptr)
+ void **ptrp)
{
loff_t idx_pos = xfarray_pos(si->array, idx);
- pgoff_t startpage;
- pgoff_t endpage;
int error = 0;
- /*
- * If this load would split a page, release the cached page, if any,
- * and perform a traditional read.
- */
- startpage = idx_pos >> PAGE_SHIFT;
- endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
- if (startpage != endpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ if (xfarray_sort_terminated(si, &error))
+ return error;
- if (xfarray_sort_terminated(si, &error))
- return error;
+ trace_xfarray_sort_scan(si, idx);
- return xfile_obj_load(si->array->xfile, ptr,
- si->array->obj_size, idx_pos);
- }
+ /* If the cached folio doesn't cover this index, release it. */
+ if (si->folio &&
+ (idx < si->first_folio_idx || idx > si->last_folio_idx))
+ xfarray_sort_scan_done(si);
- /* If the cached page is not the one we want, release it. */
- if (xfile_page_cached(&si->xfpage) &&
- xfile_page_index(&si->xfpage) != startpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ /* Grab the first folio that backs this array element. */
+ if (!si->folio) {
+ struct folio *folio;
+ loff_t next_pos;
+
+ folio = xfile_get_folio(si->array->xfile, idx_pos,
+ si->array->obj_size, XFILE_ALLOC);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ si->folio = folio;
+
+ si->first_folio_idx = xfarray_idx(si->array,
+ folio_pos(si->folio) + si->array->obj_size - 1);
+
+ next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
+ if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
+ si->last_folio_idx--;
+
+ trace_xfarray_sort_scan(si, idx);
}
/*
- * If we don't have a cached page (and we know the load is contained
- * in a single page) then grab it.
+ * If this folio still doesn't cover the desired element, it must cross
+ * a folio boundary. Read into the scratchpad and we're done.
*/
- if (!xfile_page_cached(&si->xfpage)) {
- if (xfarray_sort_terminated(si, &error))
- return error;
+ if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
+ void *temp = xfarray_scratch(si->array);
- error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
- PAGE_SIZE);
+ error = xfile_load(si->array->xfile, temp, si->array->obj_size,
+ idx_pos);
if (error)
return error;
+
+ *ptrp = temp;
+ return 0;
}
- memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
- si->array->obj_size);
+ /* Otherwise return a pointer to the array element in the folio. */
+ *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0;
}
@@ -952,6 +926,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) {
+ int ret;
+
lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth];
@@ -964,13 +940,13 @@ xfarray_sort(
}
/*
- * If directly mapping the page and sorting can solve our
+ * If directly mapping the folio and sorting can solve our
* problems, we're done.
*/
- if (xfarray_want_pagesort(si, lo, hi)) {
- error = xfarray_pagesort(si, lo, hi);
- if (error)
- goto out_free;
+ ret = xfarray_foliosort(si, lo, hi);
+ if (ret < 0)
+ goto out_free;
+ if (ret == 1) {
si->stack_depth--;
continue;
}
@@ -995,25 +971,24 @@ xfarray_sort(
* than the pivot is on the right side of the range.
*/
while (lo < hi) {
+ void *p;
+
/*
* Decrement hi until it finds an a[hi] less than the
* pivot value.
*/
- error = xfarray_sort_load_cached(si, hi, scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
hi--;
- error = xfarray_sort_load_cached(si, hi,
- scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@@ -1028,21 +1003,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than
* the pivot value.
*/
- error = xfarray_sort_load_cached(si, lo, scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo++;
- error = xfarray_sort_load_cached(si, lo,
- scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@@ -1078,6 +1050,24 @@ xfarray_sort(
out_free:
trace_xfarray_sort_stats(si, error);
+ xfarray_sort_scan_done(si);
kvfree(si);
return error;
}
+
+/* How many bytes is this array consuming? */
+unsigned long long
+xfarray_bytes(
+ struct xfarray *array)
+{
+ return xfile_bytes(array->xfile);
+}
+
+/* Empty the entire array. */
+void
+xfarray_truncate(
+ struct xfarray *array)
+{
+ xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
+ array->nr = 0;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 62b9c506fdd1..5eeeeed13ae2 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -8,6 +8,7 @@
/* xfile array index type, along with cursor initialization */
typedef uint64_t xfarray_idx_t;
+#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL)
#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
/* Iterate each index of an xfile array. */
@@ -44,6 +45,27 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+void xfarray_truncate(struct xfarray *array);
+unsigned long long xfarray_bytes(struct xfarray *array);
+
+/*
+ * Load an array element, but zero the buffer if there's no data because we
+ * haven't stored to that array element yet.
+ */
+static inline int
+xfarray_load_sparse(
+ struct xfarray *array,
+ uint64_t idx,
+ void *rec)
+{
+ int error = xfarray_load(array, idx, rec);
+
+ if (error == -ENODATA) {
+ memset(rec, 0, array->obj_size);
+ return 0;
+ }
+ return error;
+}
/* Append an element to the array. */
static inline int xfarray_append(struct xfarray *array, const void *ptr)
@@ -105,9 +127,17 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
- /* Cache a page here for faster access. */
- struct xfile_page xfpage;
- void *page_kaddr;
+ /* next time we want to cond_resched() */
+ struct xchk_relax relax;
+
+ /* Cache a folio here for faster scanning for pivots */
+ struct folio *folio;
+
+ /* First array index in folio that is completely readable */
+ xfarray_idx_t first_folio_idx;
+
+ /* Last array index in folio that is completely readable */
+ xfarray_idx_t last_folio_idx;
#ifdef DEBUG
/* Performance statistics. */
diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c
new file mode 100644
index 000000000000..6ef2a9637f16
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+
+/*
+ * XFS Blob Storage
+ * ================
+ * Stores and retrieves blobs using an xfile. Objects are appended to the file
+ * and the offset is returned as a magic cookie for retrieval.
+ */
+
+#define XB_KEY_MAGIC 0xABAADDAD
+struct xb_key {
+ uint32_t xb_magic; /* XB_KEY_MAGIC */
+ uint32_t xb_size; /* size of the blob, in bytes */
+ loff_t xb_offset; /* byte offset of this key */
+ /* blob comes after here */
+} __packed;
+
+/* Initialize a blob storage object. */
+int
+xfblob_create(
+ const char *description,
+ struct xfblob **blobp)
+{
+ struct xfblob *blob;
+ struct xfile *xfile;
+ int error;
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS);
+ if (!blob) {
+ error = -ENOMEM;
+ goto out_xfile;
+ }
+
+ blob->xfile = xfile;
+ blob->last_offset = PAGE_SIZE;
+
+ *blobp = blob;
+ return 0;
+
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy a blob storage object. */
+void
+xfblob_destroy(
+ struct xfblob *blob)
+{
+ xfile_destroy(blob->xfile);
+ kfree(blob);
+}
+
+/* Retrieve a blob. */
+int
+xfblob_load(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ void *ptr,
+ uint32_t size)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+ if (size < key.xb_size) {
+ ASSERT(0);
+ return -EFBIG;
+ }
+
+ return xfile_load(blob->xfile, ptr, key.xb_size,
+ cookie + sizeof(key));
+}
+
+/* Store a blob. */
+int
+xfblob_store(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const void *ptr,
+ uint32_t size)
+{
+ struct xb_key key = {
+ .xb_offset = blob->last_offset,
+ .xb_magic = XB_KEY_MAGIC,
+ .xb_size = size,
+ };
+ loff_t pos = blob->last_offset;
+ int error;
+
+ error = xfile_store(blob->xfile, &key, sizeof(key), pos);
+ if (error)
+ return error;
+
+ pos += sizeof(key);
+ error = xfile_store(blob->xfile, ptr, size, pos);
+ if (error)
+ goto out_err;
+
+ *cookie = blob->last_offset;
+ blob->last_offset += sizeof(key) + size;
+ return 0;
+out_err:
+ xfile_discard(blob->xfile, blob->last_offset, sizeof(key));
+ return error;
+}
+
+/* Free a blob. */
+int
+xfblob_free(
+ struct xfblob *blob,
+ xfblob_cookie cookie)
+{
+ struct xb_key key;
+ int error;
+
+ error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+ if (error)
+ return error;
+
+ if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size);
+ return 0;
+}
+
+/* How many bytes is this blob storage object consuming? */
+unsigned long long
+xfblob_bytes(
+ struct xfblob *blob)
+{
+ return xfile_bytes(blob->xfile);
+}
+
+/* Drop all the blobs. */
+void
+xfblob_truncate(
+ struct xfblob *blob)
+{
+ xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE);
+ blob->last_offset = PAGE_SIZE;
+}
diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h
new file mode 100644
index 000000000000..ae78322613ca
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFBLOB_H__
+#define __XFS_SCRUB_XFBLOB_H__
+
+struct xfblob {
+ struct xfile *xfile;
+ loff_t last_offset;
+};
+
+typedef loff_t xfblob_cookie;
+
+int xfblob_create(const char *descr, struct xfblob **blobp);
+void xfblob_destroy(struct xfblob *blob);
+int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr,
+ uint32_t size);
+int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr,
+ uint32_t size);
+int xfblob_free(struct xfblob *blob, xfblob_cookie cookie);
+unsigned long long xfblob_bytes(struct xfblob *blob);
+void xfblob_truncate(struct xfblob *blob);
+
+static inline int
+xfblob_storename(
+ struct xfblob *blob,
+ xfblob_cookie *cookie,
+ const struct xfs_name *xname)
+{
+ return xfblob_store(blob, cookie, xname->name, xname->len);
+}
+
+static inline int
+xfblob_loadname(
+ struct xfblob *blob,
+ xfblob_cookie cookie,
+ struct xfs_name *xname,
+ uint32_t size)
+{
+ int ret = xfblob_load(blob, cookie, (void *)xname->name, size);
+ if (ret)
+ return ret;
+
+ xname->len = size;
+ return 0;
+}
+
+#endif /* __XFS_SCRUB_XFBLOB_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fd..c753c79df203 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -10,9 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
-#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>
@@ -34,13 +34,6 @@
* xfiles assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
- *
- * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
- * of a hole cause a page to be mapped into the file. If you are going to
- * create a sparse xfile, please be careful about reading from uninitialized
- * parts of the file. These pages are !Uptodate and will eventually be
- * reclaimed if not written, but in the short term this boosts memory
- * consumption.
*/
/*
@@ -62,38 +55,27 @@ xfile_create(
{
struct inode *inode;
struct xfile *xf;
- int error = -ENOMEM;
+ int error;
xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
if (!xf)
return -ENOMEM;
- xf->file = shmem_file_setup(description, isize, 0);
- if (!xf->file)
- goto out_xfile;
+ xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
}
- /*
- * We want a large sparse file that we can pread, pwrite, and seek.
- * xfile users are responsible for keeping the xfile hidden away from
- * all other callers, so we skip timestamp updates and security checks.
- * Make the inode only accessible by root, just in case the xfile ever
- * escapes.
- */
- xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
- FMODE_LSEEK;
- xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
inode = file_inode(xf->file);
- inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
- inode->i_mode &= ~0177;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
-
lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
trace_xfile_create(xf);
*xfilep = xf;
@@ -118,164 +100,128 @@ xfile_destroy(
}
/*
- * Read a memory object directly from the xfile's page cache. Unlike regular
- * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
- * high an offset, instead of truncating the read. Otherwise, we return
- * bytes read or an error code, like regular pread.
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pread(
+int
+xfile_load(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- struct page *page = NULL;
- ssize_t read = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pread(xf, pos, count);
+ trace_xfile_load(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
+ unsigned int offset;
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * In-kernel reads of a shmem file cause it to allocate a page
- * if the mapping shows a hole. Therefore, if we hit ENOMEM
- * we can continue by zeroing the caller's buffer.
- */
- page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
- __GFP_NOWARN);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- if (error != -ENOMEM)
- break;
-
- memset(buf, 0, len);
- goto advance;
- }
-
- if (PageUptodate(page)) {
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
+ SGP_READ) < 0)
+ break;
+ if (!folio) {
/*
- * xfile pages must never be mapped into userspace, so
- * we skip the dcache flush.
+ * No data stored at this offset, just zero the output
+ * buffer until the next page boundary.
*/
- kaddr = kmap_local_page(page);
- p = kaddr + offset_in_page(pos);
- memcpy(buf, p, len);
- kunmap_local(kaddr);
- } else {
+ len = min_t(ssize_t, count,
+ PAGE_SIZE - offset_in_page(pos));
memset(buf, 0, len);
- }
- put_page(page);
+ } else {
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ break;
+ }
-advance:
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(buf, folio_address(folio) + offset, len);
+
+ folio_unlock(folio);
+ folio_put(folio);
+ }
count -= len;
pos += len;
buf += len;
- read += len;
}
memalloc_nofs_restore(pflags);
- if (read > 0)
- return read;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/*
- * Write a memory object directly to the xfile's page cache. Unlike regular
- * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
- * high an offset, instead of truncating the write. Otherwise, we return
- * bytes written or an error code, like regular pwrite.
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pwrite(
+int
+xfile_store(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- ssize_t written = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pwrite(xf, pos, count);
+ trace_xfile_store(xf, pos, count);
+
+ /*
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+ * actually allocates a folio instead of erroring out.
+ */
+ if (pos + count > i_size_read(inode))
+ i_size_write(inode, pos + count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *fsdata = NULL;
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
- int ret;
-
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path.
- * shmem doesn't support fs freeze, but lockdep doesn't know
- * that and will trip over that.
- */
- error = aops->write_begin(NULL, mapping, pos, len, &page,
- &fsdata);
- if (error)
- break;
+ unsigned int offset;
- /*
- * xfile pages must never be mapped into userspace, so we skip
- * the dcache flush. If the page is not uptodate, zero it
- * before writing data.
- */
- kaddr = kmap_local_page(page);
- if (!PageUptodate(page)) {
- memset(kaddr, 0, PAGE_SIZE);
- SetPageUptodate(page);
- }
- p = kaddr + offset_in_page(pos);
- memcpy(p, buf, len);
- kunmap_local(kaddr);
-
- ret = aops->write_end(NULL, mapping, pos, len, len, page,
- fsdata);
- if (ret < 0) {
- error = ret;
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
+ SGP_CACHE) < 0)
+ break;
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- written += ret;
- if (ret != len)
- break;
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(folio_address(folio) + offset, buf, len);
+
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- count -= ret;
- pos += ret;
- buf += ret;
+ count -= len;
+ pos += len;
+ buf += len;
}
memalloc_nofs_restore(pflags);
- if (written > 0)
- return written;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/* Find the next written area in the xfile data for a given offset. */
@@ -291,129 +237,88 @@ xfile_seek_data(
return ret;
}
-/* Query stat information for an xfile. */
-int
-xfile_stat(
- struct xfile *xf,
- struct xfile_stat *statbuf)
-{
- struct kstat ks;
- int error;
-
- error = vfs_getattr_nosec(&xf->file->f_path, &ks,
- STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
- if (error)
- return error;
-
- statbuf->size = ks.size;
- statbuf->bytes = ks.blocks << SECTOR_SHIFT;
- return 0;
-}
-
/*
- * Grab the (locked) page for a memory object. The object cannot span a page
- * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
- * cannot grab the page, or the usual negative errno.
+ * Grab the (locked) folio for a memory object. The object cannot span a folio
+ * boundary. Returns the locked folio if successful, NULL if there was no
+ * folio or it didn't cover the range requested, or an ERR_PTR on failure.
*/
-int
-xfile_get_page(
+struct folio *
+xfile_get_folio(
struct xfile *xf,
loff_t pos,
- unsigned int len,
- struct xfile_page *xfpage)
+ size_t len,
+ unsigned int flags)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- void *fsdata = NULL;
- loff_t key = round_down(pos, PAGE_SIZE);
+ struct folio *folio = NULL;
unsigned int pflags;
int error;
if (inode->i_sb->s_maxbytes - pos < len)
- return -ENOMEM;
- if (len > PAGE_SIZE - offset_in_page(pos))
- return -ENOTBLK;
+ return ERR_PTR(-ENOMEM);
- trace_xfile_get_page(xf, pos, len);
-
- pflags = memalloc_nofs_save();
+ trace_xfile_get_folio(xf, pos, len);
/*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path. shmem
- * doesn't support fs freeze, but lockdep doesn't know that and will
- * trip over that.
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+ * actually allocates a folio instead of erroring out.
*/
- error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
- &fsdata);
+ if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
+ i_size_write(inode, pos + len);
+
+ pflags = memalloc_nofs_save();
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
+ (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
+ memalloc_nofs_restore(pflags);
if (error)
- goto out_pflags;
+ return ERR_PTR(error);
- /* We got the page, so make sure we push out EOF. */
- if (i_size_read(inode) < pos + len)
- i_size_write(inode, pos + len);
+ if (!folio)
+ return NULL;
- /*
- * If the page isn't up to date, fill it with zeroes before we hand it
- * to the caller and make sure the backing store will hold on to them.
- */
- if (!PageUptodate(page)) {
- void *kaddr;
+ if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return NULL;
+ }
- kaddr = kmap_local_page(page);
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_local(kaddr);
- SetPageUptodate(page);
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EIO);
}
/*
- * Mark each page dirty so that the contents are written to some
- * backing store when we drop this buffer, and take an extra reference
- * to prevent the xfile page from being swapped or removed from the
- * page cache by reclaim if the caller unlocks the page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfile_put_folio.
*/
- set_page_dirty(page);
- get_page(page);
-
- xfpage->page = page;
- xfpage->fsdata = fsdata;
- xfpage->pos = key;
-out_pflags:
- memalloc_nofs_restore(pflags);
- return error;
+ if (flags & XFILE_ALLOC)
+ folio_mark_dirty(folio);
+ return folio;
}
/*
- * Release the (locked) page for a memory object. Returns 0 or a negative
- * errno.
+ * Release the (locked) folio for a memory object.
*/
-int
-xfile_put_page(
+void
+xfile_put_folio(
struct xfile *xf,
- struct xfile_page *xfpage)
+ struct folio *folio)
{
- struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- unsigned int pflags;
- int ret;
-
- trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
+ trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));
- /* Give back the reference that we took in xfile_get_page. */
- put_page(xfpage->page);
+ folio_unlock(folio);
+ folio_put(folio);
+}
- pflags = memalloc_nofs_save();
- ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
- xfpage->page, xfpage->fsdata);
- memalloc_nofs_restore(pflags);
- memset(xfpage, 0, sizeof(struct xfile_page));
+/* Discard the page cache that's backing a range of the xfile. */
+void
+xfile_discard(
+ struct xfile *xf,
+ loff_t pos,
+ u64 count)
+{
+ trace_xfile_discard(xf, pos, count);
- if (ret < 0)
- return ret;
- if (ret != PAGE_SIZE)
- return -EIO;
- return 0;
+ shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1);
}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index d56643b0f429..cc2cc1714cd4 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -6,22 +6,6 @@
#ifndef __XFS_SCRUB_XFILE_H__
#define __XFS_SCRUB_XFILE_H__
-struct xfile_page {
- struct page *page;
- void *fsdata;
- loff_t pos;
-};
-
-static inline bool xfile_page_cached(const struct xfile_page *xfpage)
-{
- return xfpage->page != NULL;
-}
-
-static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
-{
- return xfpage->page->index;
-}
-
struct xfile {
struct file *file;
};
@@ -29,49 +13,23 @@ struct xfile {
int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
void xfile_destroy(struct xfile *xf);
-ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
-ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
-/*
- * Load an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pread(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Store an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pwrite(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
+void xfile_discard(struct xfile *xf, loff_t pos, u64 count);
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
-struct xfile_stat {
- loff_t size;
- unsigned long long bytes;
-};
+#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
-int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */
+struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
+ unsigned int flags);
+void xfile_put_folio(struct xfile *xf, struct folio *folio);
-int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
- struct xfile_page *xbuf);
-int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+static inline unsigned long long xfile_bytes(struct xfile *xf)
+{
+ return file_inode(xf->file)->i_blocks << SECTOR_SHIFT;
+}
#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index a39befa743ce..f17173b83e6f 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,11 @@
#define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB
-# define xfs_scrub_metadata(file, sm) (-ENOTTY)
+# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY)
+# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY)
#else
-int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+int xfs_ioc_scrub_metadata(struct file *file, void __user *arg);
+int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg);
#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */