Diffstat (limited to 'fs/xfs/scrub')
91 files changed, 22974 insertions, 1091 deletions
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e954f07679dd..303374df44bd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -15,6 +15,7 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -59,6 +60,34 @@ xchk_superblock_xref( } /* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtly different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. + */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + return offsetofend(struct xfs_dsb, sb_rtreserved); + if (xfs_has_metadir(mp)) + return offsetofend(struct xfs_dsb, sb_pad); + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + +/* * Scrub the filesystem superblock. * * Note: We do /not/ attempt to check AG 0's superblock. Mount is @@ -74,6 +103,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -143,11 +173,19 @@ xchk_superblock( if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) xchk_block_set_preen(sc, bp); - if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(sc->mp)) { + if (sb->sb_rbmino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_rsumino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xchk_block_set_preen(sc, bp); + } if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) xchk_block_set_corrupt(sc, bp); @@ -165,8 +203,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); /* Check sb_versionnum bits that are set at mkfs time. */ - vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | - XFS_SB_VERSION_NUMBITS | + vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALIGNBIT | XFS_SB_VERSION_DALIGNBIT | XFS_SB_VERSION_SHAREDBIT | @@ -224,11 +261,19 @@ xchk_superblock( * sb_icount, sb_ifree, sb_fdblocks, sb_frexents */ - if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_uquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_gquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xchk_block_set_preen(sc, bp); + } /* * Skip the quota flags since repair will force quotacheck.
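The sizing helper above works because each on-disk feature revision has only ever appended fields to struct xfs_dsb, so the written size of a primary superblock is offsetofend() of the last field defined by the newest enabled feature, tested newest-first. A minimal userspace sketch of the same pattern (the struct and feature flags below are invented for illustration and are not the real xfs_dsb layout; offsetofend() mirrors the kernel's definition in <linux/stddef.h>):

#include <stddef.h>
#include <stdio.h>

/* offsetofend(): offset of the first byte past MEMBER, as in <linux/stddef.h>. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

/* Invented stand-in for struct xfs_dsb: each revision appends fields. */
struct demo_sb {
	unsigned int		magic;		/* original format ends here */
	unsigned int		crc;		/* added by a "crc" feature */
	unsigned long long	lsn;		/* also added by "crc" */
	unsigned char		meta_uuid[16];	/* added by a "metauuid" feature */
};

/* Test the newest feature first, like xchk_superblock_ondisk_size(). */
static size_t demo_sb_ondisk_size(int has_metauuid, int has_crc)
{
	if (has_metauuid)
		return offsetofend(struct demo_sb, meta_uuid);
	if (has_crc)
		return offsetofend(struct demo_sb, lsn);
	return offsetofend(struct demo_sb, magic);
}

int main(void)
{
	printf("v1: %zu bytes\n", demo_sb_ondisk_size(0, 0));
	printf("crc: %zu bytes\n", demo_sb_ondisk_size(0, 1));
	printf("metauuid: %zu bytes\n", demo_sb_ondisk_size(1, 1));
	return 0;
}

Everything from that computed length to the end of the superblock sector must then be zero, which is exactly what the memchr_inv() check against sblen below enforces.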
@@ -337,8 +382,13 @@ xchk_superblock( if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) xchk_block_set_corrupt(sc, bp); - if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_pquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xchk_block_set_preen(sc, bp); + } /* Don't care about sb_lsn */ } @@ -349,9 +399,26 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); } + if (xfs_has_metadir(mp)) { + if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) + xchk_block_set_corrupt(sc, bp); + + if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) + xchk_block_set_corrupt(sc, bp); + } + /* Everything else must be zero. */ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); @@ -434,7 +501,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -483,7 +550,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -552,7 +619,7 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ @@ -816,7 +883,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) @@ -865,6 +932,43 @@ xchk_agi_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Check the unlinked buckets for links to bad inodes. We hold the AGI, so + * there cannot be any threads updating unlinked list pointers in this AG. + */ +STATIC void +xchk_iunlink( + struct xfs_scrub *sc, + struct xfs_agi *agi) +{ + unsigned int i; + struct xfs_inode *ip; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]); + + while (agino != NULLAGINO) { + if (agino % XFS_AGI_UNLINKED_BUCKETS != i) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + agino = ip->i_next_unlinked; + } + } +} + /* Scrub the AGI. 
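 * Besides checking the header fields against the incore per-AG state, this
 * now calls xchk_iunlink() above to walk each unlinked bucket and confirm
 * that every chained inode is incore, is on the unlinked list, and hashes
 * to the bucket that links to it.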
*/ int xchk_agi( @@ -895,7 +999,7 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ @@ -949,6 +1053,8 @@ xchk_agi( if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xchk_iunlink(sc, agi); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 427054b65b23..cd6f0223879f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -21,13 +21,18 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_inode.h" +#include "xfs_iunlink_item.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/agino_bitmap.h" #include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Superblock */ @@ -203,8 +208,8 @@ xrep_agf_init_header( memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(pag->pag_agno); - agf->agf_length = cpu_to_be32(pag->block_count); + agf->agf_seqno = cpu_to_be32(pag_agno(pag)); + agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -251,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -379,7 +384,7 @@ xrep_agf( * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); if (error) @@ -642,7 +647,7 @@ xrep_agfl_fill( xfs_agblock_t agbno = start; int error; - trace_xrep_agfl_insert(sc->sa.pag, agbno, len); + trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len); while (agbno < start + len && af->fl_off < af->flcount) af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); @@ -682,7 +687,7 @@ xrep_agfl_init_header( agfl = XFS_BUF_TO_AGFL(agfl_bp); memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag)); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); /* @@ -691,7 +696,7 @@ xrep_agfl_init_header( * step. */ xagb_bitmap_init(&af.used_extents); - af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); if (error) @@ -736,7 +741,7 @@ xrep_agfl( * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. 
*/ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); if (error) @@ -796,15 +801,57 @@ enum { XREP_AGI_MAX }; +#define XREP_AGI_LOOKUP_BATCH 32 + +struct xrep_agi { + struct xfs_scrub *sc; + + /* AGI buffer, tracked separately */ + struct xfs_buf *agi_bp; + + /* context for finding btree roots */ + struct xrep_find_ag_btree fab[XREP_AGI_MAX]; + + /* old AGI contents in case we have to revert */ + struct xfs_agi old_agi; + + /* bitmap of which inodes are unlinked */ + struct xagino_bitmap iunlink_bmp; + + /* heads of the unlinked inode bucket lists */ + xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS]; + + /* scratchpad for batched lookups of the radix tree */ + struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH]; + + /* Map of ino -> next_ino for unlinked inode processing. */ + struct xfarray *iunlink_next; + + /* Map of ino -> prev_ino for unlinked inode processing. */ + struct xfarray *iunlink_prev; +}; + +static void +xrep_agi_buf_cleanup( + void *buf) +{ + struct xrep_agi *ragi = buf; + + xfarray_destroy(ragi->iunlink_prev); + xfarray_destroy(ragi->iunlink_next); + xagino_bitmap_destroy(&ragi->iunlink_bmp); +} + /* * Given the inode btree roots described by *fab, find the roots, check them * for sanity, and pass the root data back out via *fab. */ STATIC int xrep_agi_find_btrees( - struct xfs_scrub *sc, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xrep_find_ag_btree *fab = ragi->fab; struct xfs_buf *agf_bp; struct xfs_mount *mp = sc->mp; int error; @@ -837,10 +884,11 @@ xrep_agi_find_btrees( */ STATIC void xrep_agi_init_header( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp, - struct xfs_agi *old_agi) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_agi *old_agi = &ragi->old_agi; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; @@ -849,17 +897,13 @@ xrep_agi_init_header( memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(pag->pag_agno); - agi->agi_length = cpu_to_be32(pag->block_count); + agi->agi_seqno = cpu_to_be32(pag_agno(pag)); + agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - /* We don't know how to fix the unlinked list yet. */ - memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked, - sizeof(agi->agi_unlinked)); - /* Mark the incore AGF data stale until we're done fixing things. */ ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); @@ -868,10 +912,12 @@ xrep_agi_init_header( /* Set btree root information in an AGI. */ STATIC void xrep_agi_set_roots( - struct xfs_scrub *sc, - struct xfs_agi *agi, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = ragi->agi_bp->b_addr; + struct xrep_find_ag_btree *fab = ragi->fab; + agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); @@ -884,9 +930,10 @@ xrep_agi_set_roots( /* Update the AGI counters. 
*/ STATIC int xrep_agi_calc_from_btrees( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_btree_cur *cur; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; @@ -899,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -912,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); @@ -928,12 +975,713 @@ err: return error; } +/* + * Record a forwards unlinked chain pointer from agino -> next_agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_next( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t next_agino) +{ + ASSERT(next_agino != 0); + + return xfarray_store(ragi->iunlink_next, agino, &next_agino); +} + +/* + * Record a backwards unlinked chain pointer from prev_ino <- agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_prev( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t prev_agino) +{ + ASSERT(prev_agino != 0); + + return xfarray_store(ragi->iunlink_prev, agino, &prev_agino); +} + +/* + * Given an @agino, look up the next inode in the iunlink bucket. Returns + * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory + * like it should be, or a per-AG inode number. + */ +static inline xfs_agino_t +xrep_iunlink_next( + struct xfs_scrub *sc, + xfs_agino_t agino) +{ + struct xfs_inode *ip; + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) + return 0; + + return ip->i_next_unlinked; +} + +/* + * Load the inode @agino into memory, set its i_prev_unlinked, and drop the + * inode so it can be inactivated. Returns NULLAGINO if we're at the end of + * the chain or if we should stop walking the chain due to corruption; or a + * per-AG inode number. + */ +STATIC xfs_agino_t +xrep_iunlink_reload_next( + struct xrep_agi *ragi, + xfs_agino_t prev_agino, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_agino_t ret = NULLAGINO; + int error; + + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip); + if (error) + return ret; + + trace_xrep_iunlink_reload_next(ip, prev_agino); + + /* If this is a linked inode, stop processing the chain. */ + if (VFS_I(ip)->i_nlink != 0) { + xrep_iunlink_store_next(ragi, agino, NULLAGINO); + goto rele; + } + + ip->i_prev_unlinked = prev_agino; + ret = ip->i_next_unlinked; + + /* + * Drop the inode reference that we just took. We hold the AGI, so + * this inode cannot move off the unlinked list and hence cannot be + * reclaimed. + */ +rele: + xchk_irele(sc, ip); + return ret; +} + +/* + * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that + * still existed at mount time. This can happen if iunlink processing fails + * during log recovery. 
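+ * For example: with XFS_AGI_UNLINKED_BUCKETS == 64, per-AG inode 130 may
+ * only be chained from bucket 130 % 64 == 2, so the walk below gives up on
+ * a bucket as soon as the chain wanders onto an inode that hashes elsewhere.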
+ */ +STATIC int +xrep_iunlink_walk_ondisk_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino; + int error = 0; + + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + xfs_agino_t agino = next_agino; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket, + prev_agino, agino); + + if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS) + break; + + next_agino = xrep_iunlink_next(sc, agino); + if (!next_agino) + next_agino = xrep_iunlink_reload_next(ragi, prev_agino, + agino); + + prev_agino = agino; + } + + return 0; +} + +/* Decide if this is an unlinked inode in this AG. */ +STATIC bool +xrep_iunlink_igrab( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = pag_mount(pag); + + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) + return false; + + if (!xfs_inode_on_unlinked_list(ip)) + return false; + + return true; +} + +/* + * Mark the given inode in the lookup batch in our unlinked inode bitmap, and + * remember if this inode is the start of the unlinked chain. + */ +STATIC int +xrep_iunlink_visit( + struct xrep_agi *ragi, + unsigned int batch_idx) +{ + struct xfs_mount *mp = ragi->sc->mp; + struct xfs_inode *ip = ragi->lookup_batch[batch_idx]; + xfs_agino_t agino; + unsigned int bucket; + int error; + + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag)); + ASSERT(xfs_inode_on_unlinked_list(ip)); + + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket, + ragi->iunlink_heads[bucket], ip); + + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + if (error) + return error; + + if (ip->i_prev_unlinked == NULLAGINO) { + if (ragi->iunlink_heads[bucket] == NULLAGINO) + ragi->iunlink_heads[bucket] = agino; + } + + return 0; +} + +/* + * Find all incore unlinked inodes so that we can rebuild the unlinked buckets. + * We hold the AGI so there should not be any modifications to the unlinked + * list. + */ +STATIC int +xrep_iunlink_mark_incore( + struct xrep_agi *ragi) +{ + struct xfs_perag *pag = ragi->sc->sa.pag; + struct xfs_mount *mp = pag_mount(pag); + uint32_t first_index = 0; + bool done = false; + unsigned int nr_found = 0; + + do { + unsigned int i; + int error = 0; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + rcu_read_lock(); + + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ragi->lookup_batch, first_index, + XREP_AGI_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + return 0; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = ragi->lookup_batch[i]; + + if (done || !xrep_iunlink_igrab(pag, ip)) + ragi->lookup_batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that led + * us to see this inode, so another lookup from the + * same index will not find it again.
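+ *
+ * (If first_index wraps to a smaller value here, the next inode
+ * number belongs to the next AG, so we finish this batch and then
+ * stop.)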
+ */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!ragi->lookup_batch[i]) + continue; + error = xrep_iunlink_visit(ragi, i); + if (error) + return error; + } + } while (!done); + + return 0; +} + +/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */ +STATIC int +xrep_iunlink_mark_ondisk_rec( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_agi *ragi = priv; + struct xfs_scrub *sc = ragi->sc; + struct xfs_mount *mp = cur->bc_mp; + xfs_agino_t agino; + unsigned int i; + int error = 0; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + for (i = 0, agino = irec.ir_startino; + i < XFS_INODES_PER_CHUNK; + i++, agino++) { + struct xfs_inode *ip; + unsigned int len = 1; + + /* Skip free inodes */ + if (XFS_INOBT_MASK(i) & irec.ir_free) + continue; + /* Skip inodes we've seen before */ + if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len)) + continue; + + /* + * Skip incore inodes; these were already picked up by + * the _mark_incore step. + */ + rcu_read_lock(); + ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino); + rcu_read_unlock(); + if (ip) + continue; + + /* + * Try to look up this inode. If we can't get it, just move + * on because we haven't actually scrubbed the inobt or the + * inodes yet. + */ + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), + &ip); + if (error) + continue; + + trace_xrep_iunlink_reload_ondisk(ip); + + if (VFS_I(ip)->i_nlink == 0) + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + xchk_irele(sc, ip); + if (error) + break; + } + + return error; +} + +/* + * Find ondisk inodes that are unlinked and not in cache, and mark them in + * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if + * the btree is corrupt. + */ +STATIC void +xrep_iunlink_mark_ondisk( + struct xrep_agi *ragi) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_btree_cur *cur; + int error; + + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); + error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi); + xfs_btree_del_cursor(cur, error); +} + +/* + * Walk an iunlink bucket's inode list. For each inode that should be on this + * chain, clear its entry in iunlink_bmp because it's ok and we don't need + * to touch it further. + */ +STATIC int +xrep_iunlink_resolve_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino = ragi->iunlink_heads[bucket]; + int error = 0; + + while (next_agino != NULLAGINO) { + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + /* Find the next inode in the chain. */ + ip = xfs_iunlink_lookup(sc->sa.pag, next_agino); + if (!ip) { + /* Inode not incore? Terminate the chain. */ + trace_xrep_iunlink_resolve_uncached(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = NULLAGINO; + break; + } + + if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) { + /* + * Inode is in the wrong bucket. Advance the list, + * but pretend we didn't see this inode.
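+ * Its bit stays set in iunlink_bmp, so the lost-inode pass at
+ * the end of xrep_iunlink_rebuild_buckets() will reinsert it
+ * into the bucket that it actually hashes to.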
+ */ + trace_xrep_iunlink_resolve_wronglist(sc->sa.pag, + bucket, prev_agino, next_agino); + + next_agino = ip->i_next_unlinked; + continue; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + /* + * Incore inode doesn't think this inode is on an + * unlinked list. This is probably because we reloaded + * it from disk. Advance the list, but pretend we + * didn't see this inode; we'll fix that later. + */ + trace_xrep_iunlink_resolve_nolist(sc->sa.pag, + bucket, prev_agino, next_agino); + next_agino = ip->i_next_unlinked; + continue; + } + + trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino, + next_agino); + + /* + * Otherwise, this inode's unlinked pointers are ok. Clear it + * from the unlinked bitmap since we're done with it, and make + * sure the chain is still correct. + */ + error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1); + if (error) + return error; + + /* Remember the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, + next_agino); + if (error) + return error; + } + + /* Remember this inode's previous pointer. */ + error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino); + if (error) + return error; + + /* Advance the list and remember this inode. */ + prev_agino = next_agino; + next_agino = ip->i_next_unlinked; + } + + /* Update the previous inode's next pointer. */ + if (prev_agino != NULLAGINO) { + error = xrep_iunlink_store_next(ragi, prev_agino, next_agino); + if (error) + return error; + } + + return 0; +} + +/* Reinsert this unlinked inode into the head of the staged bucket list. */ +STATIC int +xrep_iunlink_add_to_bucket( + struct xrep_agi *ragi, + xfs_agino_t agino) +{ + xfs_agino_t current_head; + unsigned int bucket; + int error; + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + /* Point this inode at the current head of the bucket list. */ + current_head = ragi->iunlink_heads[bucket]; + + trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino, + current_head); + + error = xrep_iunlink_store_next(ragi, agino, current_head); + if (error) + return error; + + /* Remember the head inode's previous pointer. */ + if (current_head != NULLAGINO) { + error = xrep_iunlink_store_prev(ragi, current_head, agino); + if (error) + return error; + } + + ragi->iunlink_heads[bucket] = agino; + return 0; +} + +/* Reinsert unlinked inodes into the staged iunlink buckets. */ +STATIC int +xrep_iunlink_add_lost_inodes( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_agi *ragi = priv; + int error; + + for (; len > 0; start++, len--) { + error = xrep_iunlink_add_to_bucket(ragi, start); + if (error) + return error; + } + + return 0; +} + +/* + * Figure out the iunlink bucket values and find inodes that need to be + * reinserted into the list. + */ +STATIC int +xrep_iunlink_rebuild_buckets( + struct xrep_agi *ragi) +{ + unsigned int i; + int error; + + /* + * Walk the ondisk AGI unlinked list to find inodes that are on the + * list but aren't in memory. This can happen if a past log recovery + * tried to clear the iunlinked list but failed. Our scan rebuilds the + * unlinked list using incore inodes, so we must load and link them + * properly. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_walk_ondisk_bucket(ragi, i); + if (error) + return error; + } + + /* + * Record all the incore unlinked inodes in iunlink_bmp that we didn't + * find by walking the ondisk iunlink buckets. 
This shouldn't happen, + * but we can't risk forgetting an inode somewhere. + */ + error = xrep_iunlink_mark_incore(ragi); + if (error) + return error; + + /* + * If there are ondisk inodes that are unlinked and have not been loaded + * into cache, record them in iunlink_bmp. + */ + xrep_iunlink_mark_ondisk(ragi); + + /* + * Walk each iunlink bucket to (re)construct as much of the incore list + * as would be correct. For each inode that survives this step, clear + * its bit in iunlink_bmp; we're done with those inodes. + */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + error = xrep_iunlink_resolve_bucket(ragi, i); + if (error) + return error; + } + + /* + * Any unlinked inodes that we didn't find through the bucket list + * walk (or that were ignored by the walk) must be inserted into the + * bucket list. Stage this in memory for now. + */ + return xagino_bitmap_walk(&ragi->iunlink_bmp, + xrep_iunlink_add_lost_inodes, ragi); +} + +/* Update i_next_unlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_next( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t next_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_agino_t prev_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the backward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + + /* Update the forward pointer. */ + if (ip->i_next_unlinked != next_agino) { + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. + */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Update i_prev_unlinked for the inode @agino. */ +STATIC int +xrep_iunlink_relink_prev( + struct xrep_agi *ragi, + xfarray_idx_t idx, + xfs_agino_t prev_agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_inode *ip; + xfarray_idx_t agino = idx - 1; + bool want_rele = false; + int error = 0; + + ASSERT(prev_agino != 0); + + ip = xfs_iunlink_lookup(pag, agino); + if (!ip) { + xfs_agino_t next_agino; + + /* + * No inode exists in cache. Load it off the disk so that we + * can reinsert it into the incore unlinked list. + */ + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); + if (error) + return -EFSCORRUPTED; + + want_rele = true; + + /* Set the forward pointer since this just came off disk. */ + error = xfarray_load(ragi->iunlink_next, agino, &next_agino); + if (error) + goto out_rele; + + error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino); + if (error) + goto out_rele; + + trace_xrep_iunlink_relink_next(ip, next_agino); + ip->i_next_unlinked = next_agino; + } + + /* Update the backward pointer.
*/ + if (ip->i_prev_unlinked != prev_agino) { + trace_xrep_iunlink_relink_prev(ip, prev_agino); + ip->i_prev_unlinked = prev_agino; + } + +out_rele: + /* + * The iunlink lookup doesn't igrab because we hold the AGI buffer lock + * and the inode cannot be reclaimed. However, if we used iget to load + * a missing inode, we must irele it here. + */ + if (want_rele) + xchk_irele(sc, ip); + return error; +} + +/* Log all the iunlink updates we need to finish regenerating the AGI. */ +STATIC int +xrep_iunlink_commit( + struct xrep_agi *ragi) +{ + struct xfs_agi *agi = ragi->agi_bp->b_addr; + xfarray_idx_t idx = XFARRAY_CURSOR_INIT; + xfs_agino_t agino; + unsigned int i; + int error; + + /* Fix all the forward links */ + while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_next(ragi, idx, agino); + if (error) + return error; + } + + /* Fix all the back links */ + idx = XFARRAY_CURSOR_INIT; + while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) { + error = xrep_iunlink_relink_prev(ragi, idx, agino); + if (error) + return error; + } + + /* Copy the staged iunlink buckets to the new AGI. */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i, + be32_to_cpu(ragi->old_agi.agi_unlinked[i]), + ragi->iunlink_heads[i]); + + agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]); + } + + return 0; +} + /* Trigger reinitialization of the in-core data. */ STATIC int xrep_agi_commit_new( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_perag *pag; struct xfs_agi *agi = agi_bp->b_addr; @@ -956,48 +1704,76 @@ xrep_agi_commit_new( /* Repair the AGI. */ int xrep_agi( - struct xfs_scrub *sc) + struct xfs_scrub *sc) { - struct xrep_find_ag_btree fab[XREP_AGI_MAX] = { - [XREP_AGI_INOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_FINOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_finobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_END] = { - .buf_ops = NULL - }, - }; - struct xfs_agi old_agi; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *agi_bp; - struct xfs_agi *agi; - int error; + struct xrep_agi *ragi; + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned int i; + int error; /* We require the rmapbt to rebuild anything. 
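 * Finding the inobt/finobt roots (xrep_agi_find_btrees) works by scanning
 * the rmap records for blocks with the XFS_RMAP_OWN_INOBT owner, so without
 * the rmapbt there is nothing to rebuild from.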
*/ if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; + sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + ragi = sc->buf; + ragi->sc = sc; + + ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){ + .buf_ops = NULL, + }; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ragi->iunlink_heads[i] = NULLAGINO; + + xagino_bitmap_init(&ragi->iunlink_bmp); + sc->buf_cleanup = xrep_agi_buf_cleanup; + + descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_next); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_prev); + kfree(descr); + if (error) + return error; + /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) return error; - agi_bp->b_ops = &xfs_agi_buf_ops; - agi = agi_bp->b_addr; + ragi->agi_bp->b_ops = &xfs_agi_buf_ops; /* Find the AGI btree roots. */ - error = xrep_agi_find_btrees(sc, fab); + error = xrep_agi_find_btrees(ragi); + if (error) + return error; + + error = xrep_iunlink_rebuild_buckets(ragi); if (error) return error; @@ -1006,18 +1782,21 @@ xrep_agi( return error; /* Start rewriting the header and implant the btrees we found. */ - xrep_agi_init_header(sc, agi_bp, &old_agi); - xrep_agi_set_roots(sc, agi, fab); - error = xrep_agi_calc_from_btrees(sc, agi_bp); + xrep_agi_init_header(ragi); + xrep_agi_set_roots(ragi); + error = xrep_agi_calc_from_btrees(ragi); + if (error) + goto out_revert; + error = xrep_iunlink_commit(ragi); if (error) goto out_revert; /* Reinitialize in-core state. */ - return xrep_agi_commit_new(sc, agi_bp); + return xrep_agi_commit_new(ragi); out_revert: /* Mark the incore AGI state stale and revert the AGI. */ clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); - memcpy(agi, &old_agi, sizeof(old_agi)); + memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi)); return error; } diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h new file mode 100644 index 000000000000..56d7db5f1699 --- /dev/null +++ b/fs/xfs/scrub/agino_bitmap.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGINO_BITMAP_H__ +#define __XFS_SCRUB_AGINO_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_agino_t */ + +struct xagino_bitmap { + struct xbitmap32 aginobitmap; +}; + +static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->aginobitmap); +} + +static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->aginobitmap); +} + +static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_clear(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int len) +{ + return xbitmap32_set(&bitmap->aginobitmap, agino, len); +} + +static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap, + xfs_agino_t agino, unsigned int *len) +{ + return xbitmap32_test(&bitmap->aginobitmap, agino, len); +} + +static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->aginobitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index d1b8a4997dd2..8b282138097f 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -139,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_alloc_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index d421b253923e..bed6a09aa791 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -132,17 +132,16 @@ int xrep_setup_ag_allocbt( struct xfs_scrub *sc) { + struct xfs_group *xg = pag_group(sc->sa.pag); unsigned int busy_gen; /* * Make sure the busy extent list is clear because we can't put extents * on there twice. */ - busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); - if (xfs_extent_busy_list_empty(sc->sa.pag)) + if (xfs_extent_busy_list_empty(xg, &busy_gen)) return 0; - - return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); + return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); } /* Check for any obvious conflicts in the free extent. */ @@ -210,7 +209,7 @@ xrep_abt_stash( if (error) return error; - trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + trace_xrep_abt_found(sc->sa.pag, &arec); error = xfarray_append(ra->free_records, &arec); if (error) @@ -484,8 +483,8 @@ xrep_abt_reserve_space( ASSERT(arec.ar_blockcount <= UINT_MAX); len = min_t(unsigned int, arec.ar_blockcount, desired); - trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, - arec.ar_startblock, len, XFS_RMAP_OWN_AG); + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock, + len, XFS_RMAP_OWN_AG); error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, arec.ar_startblock, len); @@ -543,8 +542,9 @@ xrep_abt_dispose_one( /* Add a deferred rmap for each extent we used.
*/ if (resv->used > 0) - xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, - resv->used, XFS_RMAP_OWN_AG); + xfs_rmap_alloc_extent(sc->tp, false, + xfs_agbno_to_fsb(pag, resv->agbno), resv->used, + XFS_RMAP_OWN_AG); /* * For each reserved btree block we didn't use, add it to the free @@ -554,8 +554,8 @@ xrep_abt_dispose_one( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, ra->new_bnobt.oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + ra->new_bnobt.oinfo.oi_owner); error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); @@ -778,7 +778,7 @@ xrep_abt_build_new_trees( error = xrep_bnobt_sort_records(ra); if (error) - return error; + goto err_levels; /* Load the free space by block number tree. */ ra->array_cur = XFARRAY_CURSOR_INIT; @@ -849,6 +849,7 @@ xrep_allocbt( { struct xrep_abt *ra; struct xfs_mount *mp = sc->mp; + unsigned int busy_gen; char *descr; int error; @@ -869,7 +870,7 @@ xrep_allocbt( * on there twice. In theory we cleared this before we started, but * let's not risk the filesystem. */ - if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) { error = -EDEADLOCK; goto out_ra; } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 83c7feb38714..708334f9b2bd 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -10,16 +10,20 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_sf.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +#include "scrub/listxattr.h" +#include "scrub/repair.h" /* Free the buffers linked from the xattr buffer. */ static void @@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup( kvfree(ab->value); ab->value = NULL; ab->value_sz = 0; + kvfree(ab->name); + ab->name = NULL; } /* @@ -65,7 +71,7 @@ xchk_xattr_want_freemap( * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -static int +int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size) @@ -95,6 +101,12 @@ xchk_setup_xattr_buf( return -ENOMEM; } + if (xchk_could_repair(sc)) { + ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!ab->name) + return -ENOMEM; + } + resize_value: if (ab->value_sz >= value_size) return 0; @@ -121,6 +133,12 @@ xchk_setup_xattr( { int error; + if (xchk_could_repair(sc)) { + error = xrep_setup_xattr(sc); + if (error) + return error; + } + /* * We failed to get memory while checking attrs, so this time try to * get all the memory we're ever going to need. Allocate the buffer @@ -137,106 +155,105 @@ xchk_setup_xattr( /* Extended Attributes */ -struct xchk_xattr { - struct xfs_attr_list_context context; - struct xfs_scrub *sc; -}; - /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) - * to call this function for every attribute key in an inode. Once - * we're here, we load the attribute value to see if any errors happen, - * or if we get more or less data than we expected. + * We use the extended attribute walk helper to call this function for every + * attribute key in an inode. 
Once we're here, we load the attribute value to + * see if any errors happen, or if we get more or less data than we expected. */ -static void -xchk_xattr_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) +static int +xchk_xattr_actor( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) { struct xfs_da_args args = { - .op_flags = XFS_DA_OP_NOTIME, - .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, - .geo = context->dp->i_mount->m_attr_geo, + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, .whichfork = XFS_ATTR_FORK, - .dp = context->dp, + .dp = ip, .name = name, .namelen = namelen, - .hashval = xfs_da_hashname(name, namelen), - .trans = context->tp, + .trans = sc->tp, .valuelen = valuelen, + .owner = ip->i_ino, }; struct xchk_xattr_buf *ab; - struct xchk_xattr *sx; int error = 0; - sx = container_of(context, struct xchk_xattr, context); - ab = sx->sc->buf; + ab = sc->buf; - if (xchk_should_terminate(sx->sc, &error)) { - context->seen_enough = error; - return; + if (xchk_should_terminate(sc, &error)) + return error; + + if (attr_flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - if (flags & XFS_ATTR_INCOMPLETE) { + if (attr_flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ - xchk_ino_set_preen(sx->sc, context->dp->i_ino); - return; + xchk_ino_set_preen(sc, ip->i_ino); + return 0; } - /* Only one namespace bit allowed. */ - if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Does this name make sense? */ + if (!xfs_attr_namecheck(attr_flags, name, namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - /* Does this name make sense? */ - if (!xfs_attr_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Check parent pointer record. */ + if ((attr_flags & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(sc->mp, value, valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } /* - * Local xattr values are stored in the attr leaf block, so we don't - * need to retrieve the value from a remote block to detect corruption - * problems. + * Try to allocate enough memory to extract the attr value. If that + * doesn't work, return -EDEADLOCK as a signal to try again with a + * maximally sized buffer. */ - if (flags & XFS_ATTR_LOCAL) - goto fail_xref; + error = xchk_setup_xattr_buf(sc, valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; /* - * Try to allocate enough memory to extrat the attr value. If that - * doesn't work, we overload the seen_enough variable to convey - * the error message back to the main scrub function. + * Parent pointers are matched on attr name and value, so we must + * supply the xfs_parent_rec here when confirming that the dabtree + * indexing works correctly. 
*/ - error = xchk_setup_xattr_buf(sx->sc, valuelen); - if (error == -ENOMEM) - error = -EDEADLOCK; - if (error) { - context->seen_enough = error; - return; - } + if (attr_flags & XFS_ATTR_PARENT) + memcpy(ab->value, value, valuelen); args.value = ab->value; + /* + * Get the attr value to ensure that lookup can find this attribute + * through the dabtree indexing and that remote value retrieval also + * works correctly. + */ + xfs_attr_sethash(&args); error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ if (error == -ENODATA) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno, &error)) - goto fail_xref; + return error; if (args.valuelen != valuelen) - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, - args.blkno); -fail_xref: - if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = 1; - return; + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + + return 0; } /* @@ -246,7 +263,7 @@ fail_xref: * Within a char, the lowest bit of the char represents the byte with * the smallest address */ -STATIC bool +bool xchk_xattr_set_map( struct xfs_scrub *sc, unsigned long *map, @@ -403,6 +420,17 @@ xchk_xattr_block( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); hdrsize = xfs_attr3_leaf_hdr_size(leaf); + /* + * Empty xattr leaf blocks mapped at block 0 are probably a byproduct + * of a race between setxattr and a log shutdown. Anywhere else in the + * attr fork is a corruption. + */ + if (leafhdr.count == 0) { + if (blk->blkno == 0) + xchk_da_set_preen(ds, level); + else + xchk_da_set_corrupt(ds, level); + } if (leafhdr.usedbytes > mp->m_attr_geo->blksize) xchk_da_set_corrupt(ds, level); if (leafhdr.firstused > mp->m_attr_geo->blksize) @@ -411,6 +439,8 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); + if (leafhdr.holes) + xchk_da_set_preen(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -463,7 +493,6 @@ xchk_xattr_rec( xfs_dahash_t hash; int nameidx; int hdrsize; - unsigned int badflags; int error; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -493,10 +522,15 @@ xchk_xattr_rec( /* Retrieve the entry and check it. 
*/ hash = be32_to_cpu(ent->hashval); - badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | - XFS_ATTR_INCOMPLETE); - if ((ent->flags & badflags) != 0) + if (ent->flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_da_set_corrupt(ds, level); + return 0; + } + if (!xfs_attr_check_namespace(ent->flags)) { xchk_da_set_corrupt(ds, level); + return 0; + } + if (ent->flags & XFS_ATTR_LOCAL) { lentry = (struct xfs_attr_leaf_name_local *) (((char *)bp->b_addr) + nameidx); @@ -504,7 +538,10 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval, + lentry->namelen, + lentry->nameval + lentry->namelen, + be16_to_cpu(lentry->valuelen)); } else { rentry = (struct xfs_attr_leaf_name_remote *) (((char *)bp->b_addr) + nameidx); @@ -512,7 +549,13 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + if (ent->flags & XFS_ATTR_PARENT) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name, + rentry->namelen, NULL, + be32_to_cpu(rentry->valuelen)); } if (calc_hash != hash) xchk_da_set_corrupt(ds, level); @@ -556,6 +599,15 @@ xchk_xattr_check_sf( break; } + /* + * Shortform entries do not set LOCAL or INCOMPLETE, so the + * only valid flag bits here are for namespaces. + */ + if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)sfe - (char *)sf, sizeof(struct xfs_attr_sf_entry))) { @@ -588,16 +640,6 @@ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx = { - .sc = sc, - .context = { - .dp = sc->ip, - .tp = sc->tp, - .resynch = 1, - .put_listent = xchk_xattr_listent, - .allow_incomplete = true, - }, - }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -626,12 +668,6 @@ xchk_xattr( /* * Look up every xattr in this file by name and hash. * - * Use the backend implementation of xfs_attr_list to call - * xchk_xattr_listent on every attribute key in this inode. - * In other words, we use the same iterator/callback mechanism - * that listattr uses to scrub extended attributes, though in our - * _listent function, we check the value of the attribute. - * * The VFS only locks i_rwsem when modifying attrs, so keep all * three locks held because that's the only way to ensure we're * the only thread poking into the da btree. We traverse the da @@ -639,13 +675,9 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_ilocked(&sx.context); + error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) return error; - /* Did our listent function try to return any errors? */ - if (sx.context.seen_enough < 0) - return sx.context.seen_enough; - return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 48fd9402c432..7db58af56646 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -16,9 +16,16 @@ struct xchk_xattr_buf { /* Bitmap of free space in xattr leaf blocks. */ unsigned long *freemap; + /* Memory buffer used to hold salvaged xattr names. */ + unsigned char *name; + /* Memory buffer used to extract xattr values. 
*/ void *value; size_t value_sz; }; +bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map, + unsigned int start, unsigned int len); +int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size); + #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c new file mode 100644 index 000000000000..c7eb94069caf --- /dev/null +++ b/fs/xfs/scrub/attr_repair.c @@ -0,0 +1,1663 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_acl.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/attr.h" +#include "scrub/reap.h" +#include "scrub/attr_repair.h" + +/* + * Extended Attribute Repair + * ========================= + * + * We repair extended attributes by reading the attr leaf blocks looking for + * attribute entries that look salvageable (name passes verifiers, value can + * be retrieved, etc). Each extended attribute worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * file to constrain memory use. Batching the construction of the temporary + * extended attribute structure in this fashion reduces lock cycling of the + * file being repaired and the temporary file. + * + * When salvaging completes, the remaining stashed attributes are replayed to + * the temporary file. An atomic file contents exchange is used to commit the + * new xattr blocks to the file being repaired. This will disrupt attrmulti + * cursors. + */ + +struct xrep_xattr_key { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_xattr { + struct xfs_scrub *sc; + + /* Information for exchanging attr fork mappings at the end. */ + struct xrep_tempexch tx; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Number of attributes that we are salvaging. */ + unsigned long long attrs_found; + + /* Can we flush stashed attrs to the tempfile? */ + bool can_flush; + + /* Did the live update fail, and hence the repair is now out of date?
*/ + bool live_update_aborted; + + /* Lock protecting parent pointer updates */ + struct mutex lock; + + /* Fixed-size array of xrep_xattr_pptr structures. */ + struct xfarray *pptr_recs; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Hook to capture parent pointer updates. */ + struct xfs_dir_hook dhook; + + /* Scratch buffer for capturing parent pointers. */ + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Create a parent pointer in the tempfile. */ +#define XREP_XATTR_PPTR_ADD (1) + +/* Remove a parent pointer from the tempfile. */ +#define XREP_XATTR_PPTR_REMOVE (2) + +/* A stashed parent pointer update. */ +struct xrep_xattr_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; + + /* XREP_XATTR_PPTR_{ADD,REMOVE} */ + uint8_t action; +}; + +/* Set up to recreate the extended attributes. */ +int +xrep_setup_xattr( + struct xfs_scrub *sc) +{ + if (xfs_has_parent(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + return xrep_tempfile_create(sc, S_IFREG); +} + +/* + * Decide if we want to salvage this attribute. We don't bother with + * incomplete or oversized keys or values. The @value parameter can be null + * for remote attrs. + */ +STATIC int +xrep_xattr_want_salvage( + struct xrep_xattr *rx, + unsigned int attr_flags, + const void *name, + int namelen, + const void *value, + int valuelen) +{ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + if (namelen > XATTR_NAME_MAX || namelen <= 0) + return false; + if (!xfs_attr_namecheck(attr_flags, name, namelen)) + return false; + if (valuelen > XATTR_SIZE_MAX || valuelen < 0) + return false; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_valuecheck(rx->sc->mp, value, valuelen); + + return true; +} + +/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */ +STATIC int +xrep_xattr_salvage_key( + struct xrep_xattr *rx, + int flags, + unsigned char *name, + int namelen, + unsigned char *value, + int valuelen) +{ + struct xrep_xattr_key key = { + .valuelen = valuelen, + .flags = flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this attribute. + */ + if (flags & XFS_ATTR_PARENT) { + key.namelen = namelen; + + trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name, + key.namelen, value, valuelen); + } else { + while (i < namelen && name[i] != 0) + i++; + if (i == 0) + return 0; + key.namelen = i; + + trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name, + key.namelen, valuelen); + } + + error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + error = xfarray_append(rx->xattr_records, &key); + if (error) + return error; + + rx->attrs_found++; + return 0; +} + +/* + * Record a shortform extended attribute key & value for later reinsertion + * into the inode. 
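+ * Shortform entries pack the name and value back to back in sfe->nameval[],
+ * so the value starts at sfe->nameval[sfe->namelen].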
+ */ +STATIC int +xrep_xattr_salvage_sf_attr( + struct xrep_xattr *rx, + struct xfs_attr_sf_hdr *hdr, + struct xfs_attr_sf_entry *sfe) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr, + sfe->namelen)) + return 0; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr, + sfe->valuelen)) + return 0; + + if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen)) + return 0; + + return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen); +} + +/* + * Record a local format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_local_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_local *lentry) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + unsigned char *value; + unsigned int valuelen; + unsigned int namesize; + + /* + * Decode the leaf local entry format. If something seems wrong, we + * junk the attribute. + */ + value = &lentry->nameval[lentry->namelen]; + valuelen = be16_to_cpu(lentry->valuelen); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen); + if ((char *)lentry + namesize > buf_end) + return 0; + if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* Try to save this attribute. */ + return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen); +} + +/* + * Record a remote format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_remote_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_remote *rentry, + unsigned int ent_idx, + struct xfs_buf *leaf_bp) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + struct xfs_da_args args = { + .trans = rx->sc->tp, + .dp = rx->sc->ip, + .index = ent_idx, + .geo = rx->sc->mp->m_attr_geo, + .owner = rx->sc->ip->i_ino, + .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK, + .namelen = rentry->namelen, + .name = rentry->name, + .value = ab->value, + .valuelen = be32_to_cpu(rentry->valuelen), + }; + unsigned int namesize; + int error; + + /* + * Decode the leaf remote entry format. If something seems wrong, we + * junk the attribute. Note that we should never find a zero-length + * remote attribute value. + */ + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + if ((char *)rentry + namesize > buf_end) + return 0; + if (args.valuelen == 0 || + !xrep_xattr_want_salvage(rx, ent->flags, rentry->name, + rentry->namelen, NULL, args.valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* + * Enlarge the buffer (if needed) to hold the value that we're trying + * to salvage from the old extended attribute data. + */ + error = xchk_setup_xattr_buf(rx->sc, args.valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; + + /* Look up the remote value and stash it for reconstruction. 
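+	 * If the remote blocks turn out to be garbage, the lookups below fail
+	 * with -EFSBADCRC or -EFSCORRUPTED and we junk only this attribute
+	 * instead of aborting the whole salvage.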
*/ + error = xfs_attr3_leaf_getvalue(leaf_bp, &args); + if (error || args.rmtblkno == 0) + goto err_free; + + error = xfs_attr_rmtval_get(&args); + if (error) + goto err_free; + + /* Try to save this attribute. */ + error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name, + rentry->namelen, ab->value, args.valuelen); +err_free: + /* remote value was garbage, junk it */ + if (error == -EFSBADCRC || error == -EFSCORRUPTED) + error = 0; + return error; +} + +/* Extract every xattr key that we can from this attr fork block. */ +STATIC int +xrep_xattr_recover_leaf( + struct xrep_xattr *rx, + struct xfs_buf *bp) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_scrub *sc = rx->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + struct xchk_xattr_buf *ab = rx->sc->buf; + char *buf_end; + size_t off; + unsigned int nameidx; + unsigned int hdrsize; + int i; + int error = 0; + + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); + + /* Check the leaf header */ + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize); + entries = xfs_attr3_leaf_entryp(leaf); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* Skip key if it conflicts with something else? */ + off = (char *)ent - (char *)leaf; + if (!xchk_xattr_set_map(sc, ab->usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) + continue; + + /* Check the name information. */ + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr.firstused || + nameidx >= mp->m_attr_geo->blksize) + continue; + + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, i); + error = xrep_xattr_salvage_local_attr(rx, ent, nameidx, + buf_end, lentry); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, i); + error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx, + buf_end, rentry, i, bp); + } + if (error) + return error; + } + + return 0; +} + +/* Try to recover shortform attrs. */ +STATIC int +xrep_xattr_recover_sf( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_sf_hdr *hdr; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK); + hdr = ifp->if_data; + + bitmap_zero(ab->usedmap, ifp->if_bytes); + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr)); + + sfe = xfs_attr_sf_firstentry(hdr); + if ((unsigned char *)sfe > end) + return 0; + + for (i = 0; i < hdr->count; i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) + break; + + if (xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)hdr, + sizeof(struct xfs_attr_sf_entry))) { + /* + * No conflicts with the sf entry; let's save this + * attribute. + */ + error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe); + if (error) + return error; + } + + sfe = next; + } + + return 0; +} + +/* + * Try to return a buffer of xattr data for a given physical extent. 
+ * + * Because the buffer cache get function complains if it finds a buffer + * matching the block number but not matching the length, we must be careful to + * look for incore buffers (up to the maximum length of a remote value) that + * could be hiding anywhere in the physical range. If we find an incore + * buffer, we can pass that to the caller. Optionally, read a single block and + * pass that back. + * + * Note the subtlety that remote attr value blocks for which there is no incore + * buffer will be passed to the callback one block at a time. These buffers + * will not have any ops attached and must be staled to prevent aliasing with + * multiblock buffers once we drop the ILOCK. + */ +STATIC int +xrep_xattr_find_buf( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + bool can_read, + struct xfs_buf **bpp) +{ + struct xrep_bufscan scan = { + .daddr = XFS_FSB_TO_DADDR(mp, fsbno), + .max_sectors = xrep_bufscan_max_sectors(mp, max_len), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + *bpp = bp; + return 0; + } + + if (!can_read) { + *bpp = NULL; + return 0; + } + + return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1), + XBF_TRYLOCK, bpp, NULL); +} + +/* + * Deal with a buffer that we found during our walk of the attr fork. + * + * Attribute leaf and node blocks are simple -- they're a single block, so we + * can walk them one at a time and we never have to worry about discontiguous + * multiblock buffers like we do for directories. + * + * Unfortunately, remote attr blocks add a lot of complexity here. Each disk + * block is totally self contained, in the sense that the v5 header provides no + * indication that there could be more data in the next block. The incore + * buffers can span multiple blocks, though they never cross extent records. + * However, they don't necessarily start or end on an extent record boundary. + * Therefore, we need a special buffer find function to walk the buffer cache + * for us. + * + * The caller must hold the ILOCK on the file being repaired. We use + * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't + * own the block and don't want to hang the system on a potentially garbage + * buffer. + */ +STATIC int +xrep_xattr_recover_block( + struct xrep_xattr *rx, + xfs_dablk_t dabno, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + xfs_extlen_t *actual_len) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + int error; + + error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp); + if (error) + return error; + info = bp->b_addr; + *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length); + + trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno, + be16_to_cpu(info->magic)); + + /* + * If the buffer has the right magic number for an attr leaf block and + * passes a structure check (we don't care about checksums), salvage + * as much as we can from the block. */ + if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) && + xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) && + xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL) + error = xrep_xattr_recover_leaf(rx, bp); + + /* + * If the buffer didn't already have buffer ops set, it was read in by + * the _find_buf function and could very well be /part/ of a multiblock + * remote block. Mark it stale so that it doesn't hang around in + * memory to cause problems. 
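+	 * Otherwise a later multiblock read at the same daddr could collide
+	 * with this single-block buffer and trip the length check in the
+	 * buffer cache.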
 */
+	if (bp->b_ops == NULL)
+		xfs_buf_stale(bp);
+
+	xfs_buf_relse(bp);
+	return error;
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_xattr_insert_rec(
+	struct xrep_xattr	*rx,
+	const struct xrep_xattr_key *key)
+{
+	struct xfs_da_args	args = {
+		.dp		= rx->sc->tempip,
+		.attr_filter	= key->flags,
+		.namelen	= key->namelen,
+		.valuelen	= key->valuelen,
+		.owner		= rx->sc->ip->i_ino,
+		.geo		= rx->sc->mp->m_attr_geo,
+		.whichfork	= XFS_ATTR_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+	};
+	struct xchk_xattr_buf	*ab = rx->sc->buf;
+	int			error;
+
+	/*
+	 * Grab pointers to the scrub buffer so that we can use them to insert
+	 * attrs into the temp file.
+	 */
+	args.name = ab->name;
+	args.value = ab->value;
+
+	/*
+	 * The attribute name is stored near the end of the in-core buffer,
+	 * though we reserve one more byte to ensure null termination.
+	 */
+	ab->name[XATTR_NAME_MAX] = 0;
+
+	error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name,
+			key->namelen);
+	if (error)
+		return error;
+
+	error = xfblob_free(rx->xattr_blobs, key->name_cookie);
+	if (error)
+		return error;
+
+	error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value,
+			key->valuelen);
+	if (error)
+		return error;
+
+	error = xfblob_free(rx->xattr_blobs, key->value_cookie);
+	if (error)
+		return error;
+
+	ab->name[key->namelen] = 0;
+
+	if (key->flags & XFS_ATTR_PARENT) {
+		trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags,
+				ab->name, key->namelen, ab->value,
+				key->valuelen);
+		args.op_flags |= XFS_DA_OP_LOGGED;
+	} else {
+		trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags,
+				ab->name, key->namelen, key->valuelen);
+	}
+
+	/*
+	 * xfs_attr_set creates and commits its own transaction. If the attr
+	 * already exists, we'll just drop it during the rebuild.
+	 */
+	xfs_attr_sethash(&args);
+	error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false);
+	if (error == -EEXIST)
+		error = 0;
+
+	return error;
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file. This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_xattr_flush_stashed(
+	struct xrep_xattr	*rx)
+{
+	xfarray_idx_t		array_cur;
+	int			error;
+
+	/*
+	 * Entering this function, the scrub context has a reference to the
+	 * inode being repaired, the temporary file, and a scrub transaction
+	 * that we use during xattr salvaging to avoid livelocking if there
+	 * are cycles in the xattr structures. We hold ILOCK_EXCL on both
+	 * the inode being repaired and the temporary file, though neither is
+	 * ijoined to the scrub transaction.
+	 *
+	 * To constrain kernel memory use, we occasionally flush salvaged
+	 * xattrs from the xfarray and xfblob structures into the temporary
+	 * file in preparation for exchanging the xattr structures at the end.
+	 * Updating the temporary file requires a transaction, so we commit the
+	 * scrub transaction and drop the two ILOCKs so that xfs_attr_set can
+	 * allocate whatever transaction it wants.
+	 *
+	 * We still hold IOLOCK_EXCL on the inode being repaired, which
+	 * prevents anyone from modifying the damaged xattr data while we
+	 * repair it.
+	 */
+	error = xrep_trans_commit(rx->sc);
+	if (error)
+		return error;
+	xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+
+	/*
+	 * Take the IOLOCK of the temporary file while we modify xattrs. This
+	 * isn't strictly required because the temporary file is never revealed
+	 * to userspace, but we follow the same locking rules. We still hold
+	 * sc->ip's IOLOCK.
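+	 *
+	 * In outline, each flush cycle is:
+	 *
+	 *	xrep_trans_commit(rx->sc);
+	 *	xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+	 *	xrep_tempfile_iolock_polled(rx->sc);
+	 *	xrep_xattr_insert_rec(rx, &key);	(for each stashed attr)
+	 *	xrep_tempfile_iounlock(rx->sc);
+	 *	xchk_trans_alloc(rx->sc, 0);
+	 *	xchk_ilock(rx->sc, XFS_ILOCK_EXCL);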
+ */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rx->xattr_records, array_cur) { + struct xrep_xattr_key key; + + error = xfarray_load(rx->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_xattr_insert_rec(rx, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->xattr_records); + xfblob_truncate(rx->xattr_blobs); + + xrep_tempfile_iounlock(rx->sc); + + /* Recreate the salvage transaction and relock the inode. */ + error = xchk_trans_alloc(rx->sc, 0); + if (error) + return error; + xchk_ilock(rx->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_xattr_want_flush_stashed( + struct xrep_xattr *rx) +{ + unsigned long long bytes; + + if (!rx->can_flush) + return false; + + bytes = xfarray_bytes(rx->xattr_records) + + xfblob_bytes(rx->xattr_blobs); + return bytes > XREP_XATTR_MAX_STASH_BYTES; +} + +/* + * Did we observe rename changing parent pointer xattrs while we were flushing + * salvaged attrs? + */ +static inline bool +xrep_xattr_saw_pptr_conflict( + struct xrep_xattr *rx) +{ + bool ret; + + ASSERT(rx->can_flush); + + if (!xfs_has_parent(rx->sc->mp)) + return false; + + xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL); + + mutex_lock(&rx->lock); + ret = xfarray_bytes(rx->pptr_recs) > 0; + mutex_unlock(&rx->lock); + + return ret; +} + +/* + * Reset the entire repair state back to initial conditions, now that we've + * detected a parent pointer update to the attr structure while we were + * flushing salvaged attrs. See the locking notes in dir_repair.c for more + * information on why this is all necessary. + */ +STATIC int +xrep_xattr_full_reset( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = &sc->tempip->i_af; + int error; + + trace_xrep_xattr_full_reset(sc->ip, sc->tempip); + + /* The temporary file's data fork had better not be in btree format. */ + if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ASSERT(0); + return -EIO; + } + + /* + * We begin in transaction context with sc->ip ILOCKed but not joined + * to the transaction. To reset to the initial state, we must hold + * sc->ip's ILOCK to prevent rename from updating parent pointer + * information and the tempfile's ILOCK to clear its contents. + */ + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock_both(sc); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Free all the blocks of the attr fork of the temp file, and reset + * it back to local format. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + + ASSERT(ifp->if_bytes == 0); + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK); + } + + /* Reinitialize the attr fork to an empty shortform structure. */ + hdr = ifp->if_data; + memset(hdr, 0, sizeof(*hdr)); + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + /* + * Roll this transaction to commit our reset ondisk. The tempfile + * should no longer be joined to the transaction, so we drop its ILOCK. + * This should leave us in transaction context with sc->ip ILOCKed but + * not joined to the transaction. 
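+	 * (Rolling commits everything logged so far and hands back a clean
+	 * transaction for the rest of the repair.)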
+ */ + error = xrep_roll_trans(sc); + if (error) + return error; + xrep_tempfile_iunlock(sc); + + /* + * Erase any accumulated parent pointer updates now that we've erased + * the tempfile's attr fork. We're resetting the entire repair state + * back to where we were initially, except now we won't flush salvaged + * xattrs until the very end. + */ + mutex_lock(&rx->lock); + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + + rx->can_flush = false; + rx->attrs_found = 0; + + ASSERT(xfarray_bytes(rx->xattr_records) == 0); + ASSERT(xfblob_bytes(rx->xattr_blobs) == 0); + return 0; +} + +/* Extract as many attribute keys and values as we can. */ +STATIC int +xrep_xattr_recover( + struct xrep_xattr *rx) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rx->sc; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + xfs_fileoff_t offset; + xfs_extlen_t len; + xfs_dablk_t dabno; + int nmap; + int error; + +restart: + /* + * Iterate each xattr leaf block in the attr fork to scan them for any + * attributes that we might salvage. + */ + for (offset = 0; + offset < XFS_MAX_FILEOFF; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset, + &got, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += len) { + xfs_fileoff_t curr_offset = dabno - got.br_startoff; + xfs_extlen_t maxlen; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + maxlen = min_t(xfs_filblks_t, INT_MAX, + got.br_blockcount - curr_offset); + error = xrep_xattr_recover_block(rx, dabno, + curr_offset + got.br_startblock, + maxlen, &len); + if (error) + return error; + + if (xrep_xattr_want_flush_stashed(rx)) { + error = xrep_xattr_flush_stashed(rx); + if (error) + return error; + + if (xrep_xattr_saw_pptr_conflict(rx)) { + error = xrep_xattr_full_reset(rx); + if (error) + return error; + + goto restart; + } + } + } + } + + return 0; +} + +/* + * Reset the extended attribute fork to a state where we can start re-adding + * the salvaged attributes. + */ +STATIC int +xrep_xattr_fork_remove( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); + + /* + * If the data fork is in btree format, we can't change di_forkoff + * because we could run afoul of the rule that the data fork isn't + * supposed to be in btree format if there's enough space in the fork + * that it could have used extents format. Instead, reinitialize the + * attr fork to have a shortform structure with zero attributes. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ifp->if_format = XFS_DINODE_FMT_LOCAL; + hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes, + XFS_ATTR_FORK); + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, ip, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + return 0; + } + + /* If we still have attr fork extents, something's wrong. 
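+	 * The reap should have unmapped everything, so any mappings left at
+	 * this point put the fork in a state that this repair cannot reason
+	 * about; dump the extents and give up.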
*/ + if (ifp->if_nextents != 0) { + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + unsigned int i = 0; + + xfs_emerg(sc->mp, + "inode 0x%llx attr fork still has %llu attr extents, format %d?!", + ip->i_ino, ifp->if_nextents, ifp->if_format); + for_each_xfs_iext(ifp, &icur, &irec) { + xfs_err(sc->mp, + "[%u]: startoff %llu startblock %llu blockcount %llu state %u", + i++, irec.br_startoff, + irec.br_startblock, irec.br_blockcount, + irec.br_state); + } + ASSERT(0); + return -EFSCORRUPTED; + } + + xfs_attr_fork_remove(ip, sc->tp); + return 0; +} + +/* + * Free all the attribute fork blocks of the file being repaired and delete the + * fork. The caller must ILOCK the scrub file and join it to the transaction. + * This function returns with the inode joined to a clean transaction. + */ +int +xrep_xattr_reset_fork( + struct xfs_scrub *sc) +{ + int error; + + trace_xrep_xattr_reset_fork(sc->ip, sc->ip); + + /* Unmap all the attr blocks. */ + if (xfs_ifork_has_extents(&sc->ip->i_af)) { + error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK); + if (error) + return error; + } + + error = xrep_xattr_fork_remove(sc, sc->ip); + if (error) + return error; + + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + +/* + * Free all the attribute fork blocks of the temporary file and delete the attr + * fork. The caller must ILOCK the tempfile and join it to the transaction. + * This function returns with the inode joined to a clean scrub transaction. + */ +int +xrep_xattr_reset_tempfile_fork( + struct xfs_scrub *sc) +{ + int error; + + trace_xrep_xattr_reset_fork(sc->ip, sc->tempip); + + /* + * Wipe out the attr fork of the temp file so that regular inode + * inactivation won't trip over the corrupt attr fork. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + } + + return xrep_xattr_fork_remove(sc, sc->tempip); +} + +/* + * Find all the extended attributes for this inode by scraping them out of the + * attribute key blocks by hand, and flushing them into the temp file. + * When we're done, free the staging memory before exchanging the xattr + * structures to reduce memory usage. + */ +STATIC int +xrep_xattr_salvage_attributes( + struct xrep_xattr *rx) +{ + struct xfs_inode *ip = rx->sc->ip; + int error; + + /* Short format xattrs are easy! */ + if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + error = xrep_xattr_recover_sf(rx); + if (error) + return error; + + return xrep_xattr_flush_stashed(rx); + } + + /* + * For non-inline xattr structures, the salvage function scans the + * buffer cache looking for potential attr leaf blocks. The scan + * requires the ability to lock any buffer found and runs independently + * of any transaction <-> buffer item <-> buffer linkage. Therefore, + * roll the transaction to ensure there are no buffers joined. We hold + * the ILOCK independently of the transaction. + */ + error = xfs_trans_roll(&rx->sc->tp); + if (error) + return error; + + error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + error = xrep_xattr_recover(rx); + if (error) + return error; + + return xrep_xattr_flush_stashed(rx); +} + +/* + * Add this stashed incore parent pointer to the temporary file. The caller + * must hold the tempdir's IOLOCK, must not hold any ILOCKs, and must not be in + * transaction context. 
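+ * That's because xfs_parent_set and xfs_parent_unset allocate and commit
+ * their own transactions, taking the ILOCKs themselves as they go.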
+ */ +STATIC int +xrep_xattr_replay_pptr_update( + struct xrep_xattr *rx, + const struct xfs_name *xname, + struct xrep_xattr_pptr *pptr) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + switch (pptr->action) { + case XREP_XATTR_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_xattr_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -EEXIST); + return error; + case XREP_XATTR_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_xattr_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -ENOATTR); + return error; + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the xattr rebuild, since + * files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_xattr_replay_pptr_updates( + struct xrep_xattr *rx) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rx->lock); + foreach_xfarray_idx(rx->pptr_recs, array_cur) { + struct xrep_xattr_pptr pptr; + + error = xfarray_load(rx->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rx->pptr_names, pptr.name_cookie, + &rx->xname, pptr.namelen); + if (error) + goto out_unlock; + mutex_unlock(&rx->lock); + + error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr); + if (error) + return error; + + mutex_lock(&rx->lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + return 0; +out_unlock: + mutex_unlock(&rx->lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentadd( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentremove( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Capture dirent updates being made by other threads. We will have to replay + * the parent pointer updates before exchanging attr forks. 
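+ * The hook runs in the context of the thread that's modifying the
+ * directory, so all we do there is stash the update under rx->lock and
+ * let the repair thread replay it later.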
 */
+STATIC int
+xrep_xattr_live_dirent_update(
+	struct notifier_block	*nb,
+	unsigned long		action,
+	void			*data)
+{
+	struct xfs_dir_update_params *p = data;
+	struct xrep_xattr	*rx;
+	struct xfs_scrub	*sc;
+	int			error;
+
+	rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+	sc = rx->sc;
+
+	/*
+	 * This thread updated a dirent that points to the file that we're
+	 * repairing, so stash the update for replay against the temporary
+	 * file.
+	 */
+	if (p->ip->i_ino != sc->ip->i_ino)
+		return NOTIFY_DONE;
+
+	mutex_lock(&rx->lock);
+	if (p->delta > 0)
+		error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+	else
+		error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+	if (error)
+		rx->live_update_aborted = true;
+	mutex_unlock(&rx->lock);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange. Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+	struct xfs_scrub	*sc,
+	bool			temp_local,
+	bool			ip_local)
+{
+	int			error;
+
+	/*
+	 * If the tempfile's attributes are in shortform format, convert that
+	 * to a single leaf extent so that we can use the atomic mapping
+	 * exchange.
+	 */
+	if (temp_local) {
+		struct xfs_da_args	args = {
+			.dp		= sc->tempip,
+			.geo		= sc->mp->m_attr_geo,
+			.whichfork	= XFS_ATTR_FORK,
+			.trans		= sc->tp,
+			.total		= 1,
+			.owner		= sc->ip->i_ino,
+		};
+
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (error)
+			return error;
+
+		/*
+		 * Roll the deferred log items to get us back to a clean
+		 * transaction.
+		 */
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform attribute fork, convert
+	 * that to an empty extent list in preparation for the atomic mapping
+	 * exchange.
+	 */
+	if (ip_local) {
+		struct xfs_ifork	*ifp;
+
+		ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+		xfs_idestroy_fork(ifp);
+		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+		ifp->if_nextents = 0;
+		ifp->if_bytes = 0;
+		ifp->if_data = NULL;
+		ifp->if_height = 0;
+
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	return 0;
+}
+
+/* Exchange the temporary file's attribute fork with the one being repaired. */
+int
+xrep_xattr_swap(
+	struct xfs_scrub	*sc,
+	struct xrep_tempexch	*tx)
+{
+	bool			ip_local, temp_local;
+	int			error = 0;
+
+	ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+	temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format attr fork and the rebuilt
+	 * xattr data would fit in the repaired file's attr fork, just copy
+	 * the contents from the tempfile and declare ourselves done.
+	 */
+	if (ip_local && temp_local) {
+		int	forkoff;
+		int	newsize;
+
+		newsize = xfs_attr_sf_totsize(sc->tempip);
+		forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+		if (forkoff > 0) {
+			sc->ip->i_forkoff = forkoff;
+			xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+			return 0;
+		}
+	}
+
+	/* Otherwise, make sure both attr forks are in block-mapping mode. */
+	error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
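+ * Replaying can require dropping the ILOCKs, which lets more dirent
+ * updates trickle in, so the loop below repeats until the stash stays
+ * empty while the transaction and locks are held.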
+ */ +STATIC int +xrep_xattr_finalize_tempfile( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. + */ + do { + error = xrep_xattr_replay_pptr_updates(rx); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + if (error) + return error; + + if (xfarray_length(rx->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Exchange the new extended attribute data (which we created in the tempfile) + * with the file being repaired. + */ +STATIC int +xrep_xattr_rebuild_tree( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + /* + * If we didn't find any attributes to salvage, repair the file by + * zapping its attr fork. + */ + if (rx->attrs_found == 0) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + + goto forget_acls; + } + + trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip); + + /* + * Commit the repair transaction and drop the ILOCKs so that we can use + * the atomic file content exchange helper functions to compute the + * correct resource reservations. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr + * modifications, but there's nothing to prevent userspace from reading + * the attributes until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK on the temporary file so that we can run xattr + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed parent pointer updates to the temp file. After this + * point, we're ready to exchange attr fork mappings. + */ + error = xrep_xattr_finalize_tempfile(rx); + if (error) + return error; + + /* + * Exchange the blocks mapped by the tempfile's attr fork with the file + * being repaired. The old attr blocks will then be attached to the + * tempfile, so reap its attr fork. + */ + error = xrep_xattr_swap(sc, &rx->tx); + if (error) + return error; + + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + +forget_acls: + /* Invalidate cached ACLs now that we've reloaded all the xattrs. */ + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE); + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT); + return 0; +} + +/* Tear down all the incore scan stuff we created. 
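+ * The dirent hook is removed first so that no new live updates arrive
+ * while we free the staging structures.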
*/ +STATIC void +xrep_xattr_teardown( + struct xrep_xattr *rx) +{ + if (xfs_has_parent(rx->sc->mp)) + xfs_dir_hook_del(rx->sc->mp, &rx->dhook); + if (rx->pptr_names) + xfblob_destroy(rx->pptr_names); + if (rx->pptr_recs) + xfarray_destroy(rx->pptr_recs); + xfblob_destroy(rx->xattr_blobs); + xfarray_destroy(rx->xattr_records); + mutex_destroy(&rx->lock); + kfree(rx); +} + +/* Set up the filesystem scan so we can regenerate extended attributes. */ +STATIC int +xrep_xattr_setup_scan( + struct xfs_scrub *sc, + struct xrep_xattr **rxp) +{ + struct xrep_xattr *rx; + char *descr; + int max_len; + int error; + + rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS); + if (!rx) + return -ENOMEM; + rx->sc = sc; + rx->can_flush = true; + rx->xname.name = rx->namebuf; + + mutex_init(&rx->lock); + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values. + */ + max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize); + error = xchk_setup_xattr_buf(rx->sc, max_len); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + goto out_rx; + + /* Set up some staging for salvaged attribute keys and values */ + descr = xchk_xfile_ino_descr(sc, "xattr keys"); + error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key), + &rx->xattr_records); + kfree(descr); + if (error) + goto out_rx; + + descr = xchk_xfile_ino_descr(sc, "xattr names"); + error = xfblob_create(descr, &rx->xattr_blobs); + kfree(descr); + if (error) + goto out_keys; + + if (xfs_has_parent(sc->mp)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer entries"); + error = xfarray_create(descr, 0, + sizeof(struct xrep_xattr_pptr), + &rx->pptr_recs); + kfree(descr); + if (error) + goto out_values; + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer names"); + error = xfblob_create(descr, &rx->pptr_names); + kfree(descr); + if (error) + goto out_pprecs; + + xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update); + error = xfs_dir_hook_add(sc->mp, &rx->dhook); + if (error) + goto out_ppnames; + } + + *rxp = rx; + return 0; +out_ppnames: + xfblob_destroy(rx->pptr_names); +out_pprecs: + xfarray_destroy(rx->pptr_recs); +out_values: + xfblob_destroy(rx->xattr_blobs); +out_keys: + xfarray_destroy(rx->xattr_records); +out_rx: + mutex_destroy(&rx->lock); + kfree(rx); + return error; +} + +/* + * Repair the extended attribute metadata. + * + * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer. + * The buffer cache in XFS can't handle aliased multiblock buffers, so this + * might misbehave if the attr fork is crosslinked with other filesystem + * metadata. + */ +int +xrep_xattr( + struct xfs_scrub *sc) +{ + struct xrep_xattr *rx = NULL; + int error; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + /* The rmapbt is required to reap the old attr fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_xattr_setup_scan(sc, &rx); + if (error) + return error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_xattr_salvage_attributes(rx); + if (error) + goto out_scan; + + if (rx->live_update_aborted) { + error = -EIO; + goto out_scan; + } + + /* Last chance to abort before we start committing fixes. 
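+	 * (xchk_should_terminate bails out here if the calling process has
+	 * caught a fatal signal.)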
*/ + if (xchk_should_terminate(sc, &error)) + goto out_scan; + + error = xrep_xattr_rebuild_tree(rx); + if (error) + goto out_scan; + +out_scan: + xrep_xattr_teardown(rx); + return error; +} diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h new file mode 100644 index 000000000000..979729bd4a5f --- /dev/null +++ b/fs/xfs/scrub/attr_repair.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ATTR_REPAIR_H__ +#define __XFS_SCRUB_ATTR_REPAIR_H__ + +struct xrep_tempexch; + +int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx); +int xrep_xattr_reset_fork(struct xfs_scrub *sc); +int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 0cb8d43912a8..7ba35a7a7920 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -40,22 +40,23 @@ struct xbitmap64_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. */ -static inline void +static inline __maybe_unused void xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ #define for_each_xbitmap64_extent(bn, bitmap) \ @@ -314,22 +315,23 @@ struct xbitmap32_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. */ -static inline void +static inline __maybe_unused void xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, uint32_t last); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, uint32_t last); INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, - __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap32_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. 
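 * (Iteration walks the interval tree in increasing key order, so each
 * interval is visited exactly once.)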
*/ #define for_each_xbitmap32_extent(bn, bitmap) \ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 24a15bf784f1..4f1e2574660d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,7 +19,10 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtgroup.h" #include "xfs_health.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -142,15 +145,22 @@ static inline bool xchk_bmap_get_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno, + xfs_agblock_t bno, uint64_t owner, struct xfs_rmap_irec *rmap) { + struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur; xfs_fileoff_t offset; unsigned int rflags = 0; int has_rmap; int error; + if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork)) + curp = &info->sc->sr.rmap_cur; + + if (*curp == NULL) + return false; + if (info->whichfork == XFS_ATTR_FORK) rflags |= XFS_RMAP_ATTR_FORK; if (irec->br_state == XFS_EXT_UNWRITTEN) @@ -171,13 +181,13 @@ xchk_bmap_get_rmap( * range rmap lookup to make sure we get the correct owner/offset. */ if (info->is_shared) { - error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } else { - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, - owner, offset, rflags, rmap, &has_rmap); + error = xfs_rmap_lookup_le(*curp, bno, owner, offset, + rflags, rmap, &has_rmap); } - if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, curp)) return false; if (!has_rmap) @@ -191,29 +201,29 @@ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; uint64_t owner = info->sc->ip->i_ino; - if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + if (xchk_skip_xref(info->sc->sm)) return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* * The rmap must be an exact match for this incore file mapping record, * which may have arisen from multiple ondisk records. */ - if (rmap.rm_startblock != agbno) + if (rmap.rm_startblock != bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end != agbno + irec->br_blockcount) + if (rmap_end != bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -258,7 +268,7 @@ STATIC void xchk_bmap_xref_rmap_cow( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec, - xfs_agblock_t agbno) + xfs_agblock_t bno) { struct xfs_rmap_irec rmap; unsigned long long rmap_end; @@ -268,7 +278,7 @@ xchk_bmap_xref_rmap_cow( return; /* Find the rmap record for this irec. */ - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) return; /* @@ -276,12 +286,12 @@ xchk_bmap_xref_rmap_cow( * can start before and end after the physical space allocated to this * mapping. There are no offsets to check. 
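 * Hence the checks below only require the staging rmap to cover this
 * mapping: it must start at or before the extent and end at or after
 * it.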
*/ - if (rmap.rm_startblock > agbno) + if (rmap.rm_startblock > bno) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap_end < agbno + irec->br_blockcount) + if (rmap_end < bno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -314,8 +324,58 @@ xchk_bmap_rt_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; + xfs_rgblock_t rgbno; + int error; + + error = xchk_rtgroup_init_existing(info->sc, + xfs_rtb_to_rgno(ip->i_mount, irec->br_startblock), + &info->sc->sr); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + return; + + error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + goto out_free; + xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); + + if (!xfs_has_rtrmapbt(info->sc->mp)) + goto out_cur; + + rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); + + switch (info->whichfork) { + case XFS_DATA_FORK: + xchk_bmap_xref_rmap(info, irec, rgbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + } + xchk_xref_is_not_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + break; + case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, rgbno); + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, + irec->br_blockcount, &XFS_RMAP_OINFO_COW); + xchk_xref_is_rt_cow_staging(info->sc, rgbno, + irec->br_blockcount); + xchk_xref_is_not_rt_shared(info->sc, rgbno, + irec->br_blockcount); + break; + } +out_cur: + xchk_rtgroup_btcur_free(&info->sc->sr); +out_free: + xchk_rtgroup_free(info->sc, &info->sc->sr); } /* Cross-reference a single datadev extent record. */ @@ -600,9 +660,8 @@ xchk_bmap_check_rmap( if (irec.br_startoff != check_rec.rm_offset) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); - if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.pag->pag_agno, - check_rec.rm_startblock)) + if (irec.br_startblock != + xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); if (irec.br_blockcount > check_rec.rm_blockcount) @@ -656,6 +715,30 @@ xchk_bmap_check_ag_rmaps( return error; } +/* Make sure each rt rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rt_rmaps( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg) +{ + struct xchk_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + int error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + + sbcri.sc = sc; + sbcri.whichfork = XFS_DATA_FORK; + error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); + if (error == -ECANCELED) + error = 0; + + xfs_btree_del_cursor(cur, error); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + return error; +} + /* * Decide if we want to scan the reverse mappings to determine if the attr * fork /really/ has zero space mappings. @@ -710,10 +793,6 @@ xchk_bmap_check_empty_datafork( { struct xfs_ifork *ifp = &ip->i_df; - /* Don't support realtime rmap checks yet. 
 */
-	if (XFS_IS_REALTIME_INODE(ip))
-		return false;
-
 	/*
 	 * If the dinode repair found a bad data fork, it will reset the fork
 	 * to extents format with zero records and wait for this scrubber
@@ -761,11 +840,25 @@ xchk_bmap_check_rmaps(
 	struct xfs_scrub	*sc,
 	int			whichfork)
 {
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		agno;
+	struct xfs_perag	*pag = NULL;
 	int			error;
 
-	for_each_perag(sc->mp, agno, pag) {
+	if (xfs_ifork_is_realtime(sc->ip, whichfork)) {
+		struct xfs_rtgroup	*rtg = NULL;
+
+		while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
+			error = xchk_bmap_check_rt_rmaps(sc, rtg);
+			if (error ||
+			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+				xfs_rtgroup_rele(rtg);
+				return error;
+			}
+		}
+
+		return 0;
+	}
+
+	while ((pag = xfs_perag_next(sc->mp, pag))) {
 		error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
 		if (error ||
 		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
@@ -822,9 +915,12 @@ xchk_bmap_iext_mapping(
 /* Are these two mappings contiguous with each other? */
 static inline bool
 xchk_are_bmaps_contiguous(
+	const struct xchk_bmap_info	*info,
 	const struct xfs_bmbt_irec	*b1,
 	const struct xfs_bmbt_irec	*b2)
 {
+	struct xfs_mount		*mp = info->sc->mp;
+
 	/* Don't try to combine unallocated mappings. */
 	if (!xfs_bmap_is_real_extent(b1))
 		return false;
@@ -838,6 +934,17 @@ xchk_are_bmaps_contiguous(
 		return false;
 	if (b1->br_state != b2->br_state)
 		return false;
+
+	/*
+	 * Don't combine bmaps that would cross rtgroup boundaries. This is a
+	 * valid state, but if combined they will fail rtb extent checks.
+	 */
+	if (info->is_rt && xfs_has_rtgroups(mp)) {
+		if (xfs_rtb_to_rgno(mp, b1->br_startblock) !=
+		    xfs_rtb_to_rgno(mp, b2->br_startblock))
+			return false;
+	}
+
 	return true;
 }
@@ -875,7 +982,7 @@ xchk_bmap_iext_iter(
 	 * that we just read, if possible.
 	 */
 	while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
-		if (!xchk_are_bmaps_contiguous(irec, &got))
+		if (!xchk_are_bmaps_contiguous(info, irec, &got))
 			break;
 
 		if (!xchk_bmap_iext_mapping(info, &got)) {
@@ -931,14 +1038,20 @@ xchk_bmap(
 	switch (whichfork) {
 	case XFS_COW_FORK:
-		/* No CoW forks on non-reflink filesystems. */
-		if (!xfs_has_reflink(mp)) {
+		/* No CoW forks on filesystems that don't support out-of-place writes. */
+		if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
 			return 0;
 		}
 		break;
 	case XFS_ATTR_FORK:
-		if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
+		/*
+		 * "attr" means that an attr fork was created at some point in
+		 * the life of this filesystem. "attr2" means that inodes have
+		 * variable-sized data/attr fork areas. Hence we only check
+		 * attr here.
+		 */
+		if (!xfs_has_attr(mp))
 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
 		break;
 	default:
@@ -951,6 +1064,7 @@ xchk_bmap(
 	case XFS_DINODE_FMT_UUID:
 	case XFS_DINODE_FMT_DEV:
 	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_META_BTREE:
 		/* No mappings to check.
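		 * A CoW fork should never be in one of these formats, though,
		 * so flag the inode if we see that.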
*/ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 1e656fab5e41..1084213b8e9b 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -25,11 +25,13 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" #include "xfs_refcount.h" #include "xfs_quota.h" #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_reflink.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -99,14 +101,21 @@ xrep_bmap_discover_shared( xfs_filblks_t blockcount) { struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *cur; xfs_agblock_t agbno; xfs_agblock_t fbno; xfs_extlen_t flen; int error; - agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); - error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, - &fbno, &flen, false); + if (XFS_IS_REALTIME_INODE(sc->ip)) { + agbno = xfs_rtb_to_rgbno(sc->mp, startblock); + cur = sc->sr.refc_cur; + } else { + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + cur = sc->sa.refc_cur; + } + error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen, + false); if (error) return error; @@ -196,7 +205,7 @@ xrep_bmap_check_fork_rmap( return -EFSCORRUPTED; /* Check that this is within the AG. */ - if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + if (!xfs_verify_agbext(to_perag(cur->bc_group), rec->rm_startblock, rec->rm_blockcount)) return -EFSCORRUPTED; @@ -237,7 +246,6 @@ xrep_bmap_walk_rmap( void *priv) { struct xrep_bmap *rb = priv; - struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t fsbno; int error = 0; @@ -269,8 +277,7 @@ xrep_bmap_walk_rmap( if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) return -EFSCORRUPTED; - fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); + fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), rec->rm_startblock); if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { rb->old_bmbt_block_count += rec->rm_blockcount; @@ -361,6 +368,114 @@ xrep_bmap_scan_ag( return error; } +#ifdef CONFIG_XFS_RT +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_rtfork_rmap( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + /* xattr extents are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + return -EFSCORRUPTED; + + /* bmbt blocks are never stored on realtime devices */ + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return -EFSCORRUPTED; + + /* Data extents for non-rt files are never stored on the rt device. */ + if (!XFS_IS_REALTIME_INODE(sc->ip)) + return -EFSCORRUPTED; + + /* Check the file offsets and physical extents. */ + if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check that this is within the rtgroup. */ + if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record realtime extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. 
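+	 * The owner check comes first; the fork filtering happens further
+	 * down, after the record has passed the sanity checks.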
*/ + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + return xrep_bmap_from_rmap(rb, rec->rm_offset, + xfs_rgbno_to_rtb(to_rtg(cur->bc_group), + rec->rm_startblock), + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* Scan the realtime reverse mappings to build the new extent map. */ +STATIC int +xrep_bmap_scan_rtgroup( + struct xrep_bmap *rb, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + if (!xfs_has_rtrmapbt(sc->mp)) + return 0; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | + XFS_RTGLOCK_REFCOUNT | + XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} +#else +static inline int +xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg) +{ + return -EFSCORRUPTED; +} +#endif + /* Find the delalloc extents from the old incore extent tree. */ STATIC int xrep_bmap_find_delalloc( @@ -409,12 +524,27 @@ xrep_bmap_find_mappings( struct xrep_bmap *rb) { struct xfs_scrub *sc = rb->sc; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error = 0; + /* + * Iterate the rtrmaps for extents. Metadata files never have content + * on the realtime device, so there's no need to scan them. + */ + if (!xfs_is_metadir_inode(sc->ip)) { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + error = xrep_bmap_scan_rtgroup(rb, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + } + /* Iterate the rmaps for extents. */ - for_each_perag(sc->mp, agno, pag) { + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xrep_bmap_scan_ag(rb, pag); if (error) { xfs_perag_rele(pag); @@ -480,7 +610,7 @@ xrep_bmap_iroot_size( { ASSERT(level > 0); - return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); + return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level); } /* Update the inode counters. */ @@ -734,6 +864,7 @@ xrep_bmap_check_inputs( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_META_BTREE: return -ECANCELED; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -756,10 +887,6 @@ xrep_bmap_check_inputs( return -EINVAL; } - /* Don't know how to rebuild realtime data forks. 
*/ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return -EOPNOTSUPP; - return 0; } @@ -785,10 +912,6 @@ xrep_bmap_init_reflink_scan( if (whichfork != XFS_DATA_FORK) return RLS_IRRELEVANT; - /* cannot share realtime extents */ - if (XFS_IS_REALTIME_INODE(sc->ip)) - return RLS_IRRELEVANT; - return RLS_UNKNOWN; } @@ -801,7 +924,7 @@ xrep_bmap( { struct xrep_bmap *rb; char *descr; - unsigned int max_bmbt_recs; + xfs_extnum_t max_bmbt_recs; bool large_extcount; int error = 0; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 47a20cf5205f..28ad341df8ee 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -26,16 +26,24 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" +#include "xfs_dir2.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/tempfile.h" /* Common code for the metadata scrubbers. */ @@ -118,6 +126,17 @@ xchk_process_error( } bool +xchk_process_rt_error( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, + int *error) +{ + return __xchk_process_error(sc, rgno, rgbno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool xchk_xref_process_error( struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -445,7 +464,7 @@ xchk_perag_read_headers( { int error; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -510,7 +529,7 @@ xchk_perag_drain_and_lock( * Obviously, this should be slanted against scrub and in favor * of runtime threads. */ - if (!xfs_perag_intent_busy(sa->pag)) + if (!xfs_group_intent_busy(pag_group(sa->pag))) return 0; if (sa->agf_bp) { @@ -525,7 +544,7 @@ xchk_perag_drain_and_lock( if (!(sc->flags & XCHK_FSGATES_DRAIN)) return -ECHRNG; - error = xfs_perag_intent_drain(sa->pag); + error = xfs_group_intent_drain(pag_group(sa->pag)); if (error == -ERESTARTSYS) error = -EINTR; } while (!error); @@ -680,6 +699,163 @@ xchk_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* + * For scrubbing a realtime group, grab all the in-core resources we'll need to + * check the metadata, which means taking the ILOCK of the realtime group's + * metadata inodes. Callers must not join these inodes to the transaction with + * non-zero lockflags or concurrency problems will result. The @rtglock_flags + * argument takes XFS_RTGLOCK_* flags. + */ +int +xchk_rtgroup_init( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg == NULL); + ASSERT(sr->rtlock_flags == 0); + + sr->rtg = xfs_rtgroup_get(sc->mp, rgno); + if (!sr->rtg) + return -ENOENT; + return 0; +} + +/* Lock all the rt group metadata inode ILOCKs and wait for intents. */ +int +xchk_rtgroup_lock( + struct xfs_scrub *sc, + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + int error = 0; + + ASSERT(sr->rtg != NULL); + + /* + * If we're /only/ locking the rtbitmap in shared mode, then we're + * obviously not trying to compare records in two metadata inodes. + * There's no need to drain intents here because the caller (most + * likely the rgsuper scanner) doesn't need that level of consistency. 
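+ * + * (A minimal caller sketch, using only the interfaces added in this patch: + * + * error = xchk_rtgroup_init(sc, rgno, &sc->sr); + * if (error) + * return error; + * error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + * + * such a caller takes the early return below and never waits for intent drains.)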
+ */ + if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) { + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + return 0; + } + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + + /* + * If we've grabbed a non-metadata file for scrubbing, we + * assume that holding its ILOCK will suffice to coordinate + * with any rt intent chains involving this inode. + */ + if (sc->ip && !xfs_is_internal_inode(sc->ip)) + break; + + /* + * Decide if the rt group is quiet enough for all metadata to + * be consistent with each other. Regular file IO doesn't get + * to lock all the rt inodes at the same time, which means that + * there could be other threads in the middle of processing a + * chain of deferred ops. + * + * We just locked all the metadata inodes for this rt group; + * now take a look to see if there are any intents in progress. + * If there are, drop the rt group inode locks and wait for the + * intents to drain. Since we hold the rt group inode locks + * for the duration of the scrub, this is the only time we have + * to sample the intents counter; any threads increasing it + * after this point can't possibly be in the middle of a chain + * of rt metadata updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_group_intent_busy(rtg_group(sr->rtg))) + break; + + xfs_rtgroup_unlock(sr->rtg, rtglock_flags); + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_group_intent_drain(rtg_group(sr->rtg)); + if (error) { + if (error == -ERESTARTSYS) + error = -EINTR; + return error; + } + } while (1); + + sr->rtlock_flags = rtglock_flags; + + if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); + + return 0; +} + +/* + * Free all the btree cursors and other incore data relating to the realtime + * group. This has to be done /before/ committing (or cancelling) the scrub + * transaction. + */ +void +xchk_rtgroup_btcur_free( + struct xchk_rt *sr) +{ + if (sr->rmap_cur) + xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); + if (sr->refc_cur) + xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR); + + sr->refc_cur = NULL; + sr->rmap_cur = NULL; +} + +/* + * Unlock the realtime group. This must be done /after/ committing (or + * cancelling) the scrub transaction. + */ +void +xchk_rtgroup_unlock( + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + if (sr->rtlock_flags) { + xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags); + sr->rtlock_flags = 0; + } +} + +/* + * Unlock the realtime group and release its resources. This must be done + * /after/ committing (or cancelling) the scrub transaction. + */ +void +xchk_rtgroup_free( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + xchk_rtgroup_unlock(sr); + + xfs_rtgroup_put(sr->rtg); + sr->rtg = NULL; +} +#endif /* CONFIG_XFS_RT */ + /* Per-scrubber setup functions */ void @@ -730,6 +906,14 @@ xchk_setup_fs( return xchk_trans_alloc(sc, resblks); } +/* Set us up with a transaction and an empty context to repair rt metadata. */ +int +xchk_setup_rt( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc)); +} + /* Set us up with AG headers and btree cursors. 
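 * (A hedged ordering sketch for the rt helpers above, assuming a scrub transaction in sc->tp: call xchk_rtgroup_btcur_free(&sc->sr) first, then commit or cancel the transaction, e.g. xfs_trans_cancel(sc->tp), and only then xchk_rtgroup_unlock(&sc->sr) and xchk_rtgroup_free(sc, &sc->sr), matching the before/after rules documented on each helper.)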
*/ int xchk_setup_ag_btree( @@ -781,7 +965,7 @@ xchk_iget( { ASSERT(sc->tp != NULL); - return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); + return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp); } /* @@ -827,13 +1011,13 @@ again: * in the iget cache miss path. */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); xfs_perag_put(pag); if (error) return error; - error = xfs_iget(mp, tp, inum, - XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0, + ipp); if (error == -EAGAIN) { /* * The inode may be in core but temporarily unavailable and may @@ -944,9 +1128,15 @@ xchk_iget_for_scrubbing( if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) return xchk_install_live_inode(sc, ip_in); - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. + */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -1060,12 +1250,6 @@ xchk_irele( spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); - } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { - /* - * If this is the last reference to the inode and the caller - * permits it, set DONTCACHE to avoid thrashing. - */ - d_mark_dontcache(VFS_I(ip)); } xfs_irele(ip); @@ -1087,6 +1271,10 @@ xchk_setup_inode_contents( if (error) return error; + error = xrep_tempfile_adjust_directory_tree(sc); + if (error) + return error; + /* Lock the inode so the VFS cannot touch this file. */ xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1202,27 +1390,12 @@ xchk_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - break; - } - - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); + xchk_scrub_free_subord(sub); return error; } @@ -1257,12 +1430,6 @@ xchk_metadata_inode_forks( return 0; } - /* They also should never have extended attributes. */ - if (xfs_inode_hasattr(sc->ip)) { - xchk_ino_set_corrupt(sc, sc->ip->i_ino); - return 0; - } - /* Invoke the data fork scrubber. */ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) @@ -1279,6 +1446,21 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. 
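+ * + * (The attr fork check below reuses the subordinate-scrub pattern from xchk_metadata_inode_subtype above, roughly: + * + * sub = xchk_scrub_create_subord(sc, XFS_SCRUB_TYPE_BMBTA); + * error = sub->sc.ops->scrub(&sub->sc); + * xchk_scrub_free_subord(sub); + * + * so the attr fork scrubber runs in its own subordinate context.)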
+ */ + if (xfs_inode_hasattr(sc->ip)) { + if (!xfs_has_metadir(sc->mp)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + } + return 0; } @@ -1299,7 +1481,7 @@ xchk_fsgates_enable( trace_xchk_fsgates_enable(sc, scrub_fsgates); if (scrub_fsgates & XCHK_FSGATES_DRAIN) - xfs_drain_wait_enable(); + xfs_defer_drain_wait_enable(); if (scrub_fsgates & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_enable(); @@ -1354,7 +1536,7 @@ xchk_inode_is_allocated( } /* reject inode numbers outside existing AGs */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + ino = xfs_agino_to_ino(pag, agino); if (!xfs_verify_ino(mp, ino)) return -EINVAL; @@ -1464,3 +1646,92 @@ out_rcu: rcu_read_unlock(); return error; } + +/* Is this inode a root directory for either tree? */ +bool +xchk_inode_is_dirtree_root(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + return ip == mp->m_rootip || + (xfs_has_metadir(mp) && ip == mp->m_metadirip); +} + +/* Does the superblock point down to this inode? */ +bool +xchk_inode_is_sb_rooted(const struct xfs_inode *ip) +{ + return xchk_inode_is_dirtree_root(ip) || + xfs_is_sb_inum(ip->i_mount, ip->i_ino); +} + +/* What is the root directory inumber for this inode? */ +xfs_ino_t +xchk_inode_rootdir_inum(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_is_metadir_inode(ip)) + return mp->m_metadirip->i_ino; + return mp->m_rootip->i_ino; +} + +static int +xchk_meta_btree_count_blocks( + struct xfs_scrub *sc, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_btree_cur *cur; + int error; + + if (!sc->sr.rtg) { + ASSERT(0); + return -EFSCORRUPTED; + } + + switch (sc->ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg); + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_btree_count_blocks(cur, count); + xfs_btree_del_cursor(cur, error); + if (!error) { + *nextents = 0; + (*count)--; /* don't count the btree iroot */ + } + return error; +} + +/* Count the blocks used by a file, even if it's a metadata inode. */ +int +xchk_inode_count_blocks( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t *nextents, + xfs_filblks_t *count) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + if (!ifp) { + *nextents = 0; + *count = 0; + return 0; + } + + if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) { + ASSERT(whichfork == XFS_DATA_FORK); + return xchk_meta_btree_count_blocks(sc, nextents, count); + } + + return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents, + count); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 89f7bbec887e..19877d99f255 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -6,37 +6,14 @@ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ -/* - * We /could/ terminate a scrub/repair operation early. If we're not - * in a good place to continue (fatal signal, etc.) then bail out. - * Note that we're careful not to make any judgements about *error. - */ -static inline bool -xchk_should_terminate( - struct xfs_scrub *sc, - int *error) -{ - /* - * If preemption is disabled, we need to yield to the scheduler every - * few seconds so that we don't run afoul of the soft lockup watchdog - * or RCU stall detector. 
- */ - cond_resched(); - - if (fatal_signal_pending(current)) { - if (*error == 0) - *error = -EINTR; - return true; - } - return false; -} - int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset, int *error); @@ -78,9 +55,15 @@ int xchk_checkpoint_log(struct xfs_mount *mp); bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); +static inline int xchk_setup_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} + /* Setup functions */ int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); +int xchk_setup_rt(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); @@ -92,20 +75,20 @@ int xchk_setup_directory(struct xfs_scrub *sc); int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); +int xchk_setup_dirtree(struct xfs_scrub *sc); +int xchk_setup_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); +int xchk_setup_rgsuperblock(struct xfs_scrub *sc); +int xchk_setup_rtrmapbt(struct xfs_scrub *sc); +int xchk_setup_rtrefcountbt(struct xfs_scrub *sc); #else -static inline int -xchk_setup_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_setup_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_setup_rtbitmap xchk_setup_nothing +# define xchk_setup_rtsummary xchk_setup_nothing +# define xchk_setup_rgsuperblock xchk_setup_nothing +# define xchk_setup_rtrmapbt xchk_setup_nothing +# define xchk_setup_rtrefcountbt xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -117,16 +100,8 @@ xchk_ino_dqattach(struct xfs_scrub *sc) { return 0; } -static inline int -xchk_setup_quota(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_setup_quotacheck(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_setup_quota xchk_setup_nothing +# define xchk_setup_quotacheck xchk_setup_nothing #endif int xchk_setup_fscounters(struct xfs_scrub *sc); int xchk_setup_nlinks(struct xfs_scrub *sc); @@ -152,6 +127,41 @@ xchk_ag_init_existing( return error == -ENOENT ? -EFSCORRUPTED : error; } +#ifdef CONFIG_XFS_RT + +/* All the locks we need to check an rtgroup. */ +#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + struct xchk_rt *sr); + +static inline int +xchk_rtgroup_init_existing( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + int error = xchk_rtgroup_init(sc, rgno, sr); + + return error == -ENOENT ? 
-EFSCORRUPTED : error; +} + +int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, + unsigned int rtglock_flags); +void xchk_rtgroup_unlock(struct xchk_rt *sr); +void xchk_rtgroup_btcur_free(struct xchk_rt *sr); +void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); +#else +# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) +# define xchk_rtgroup_unlock(sr) do { } while (0) +# define xchk_rtgroup_btcur_free(sr) do { } while (0) +# define xchk_rtgroup_free(sc, sr) do { } while (0) +#endif /* CONFIG_XFS_RT */ + int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); @@ -212,8 +222,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } bool xchk_dir_looks_zapped(struct xfs_inode *dp); +bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -233,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -250,13 +256,20 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_ag_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ (sc)->mp->m_super->s_id, \ - (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + (sc)->sa.pag ? \ + pag_agno((sc)->sa.pag) : (sc)->sm->sm_agno, \ ##__VA_ARGS__) #define xchk_xfile_ino_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ (sc)->mp->m_super->s_id, \ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ ##__VA_ARGS__) +#define xchk_xfile_rtgroup_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sr.rtg ? 
\ + rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take @@ -274,5 +287,11 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, bool *inuse); +int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t *nextents, xfs_filblks_t *count); + +bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip); +bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip); +xfs_ino_t xchk_inode_rootdir_inum(const struct xfs_inode *ip); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 4de3f0f40f48..38a246b8bf11 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -26,6 +26,9 @@ #include "xfs_errortag.h" #include "xfs_icache.h" #include "xfs_refcount_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -34,6 +37,7 @@ #include "scrub/bitmap.h" #include "scrub/off_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -61,7 +65,10 @@ struct xrep_cow { struct xoff_bitmap bad_fileoffs; /* Bitmap of fsblocks that were removed from the CoW fork. */ - struct xfsb_bitmap old_cowfork_fsblocks; + union { + struct xfsb_bitmap old_cowfork_fsblocks; + struct xrtb_bitmap old_cowfork_rtblocks; + }; /* CoW fork mappings used to scan for bad CoW staging extents. */ struct xfs_bmbt_irec irec; @@ -137,7 +144,6 @@ xrep_cow_mark_shared_staging( { struct xrep_cow *xc = priv; struct xfs_refcount_irec rrec; - xfs_fsblock_t fsbno; if (!xfs_refcount_check_domain(rec) || rec->rc_domain != XFS_REFC_DOMAIN_SHARED) @@ -145,9 +151,9 @@ xrep_cow_mark_shared_staging( xrep_cow_trim_refcount(xc, &rrec, rec); - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - rrec.rc_startblock); - return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); + return xrep_cow_mark_file_range(xc, + xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock), + rrec.rc_blockcount); } /* @@ -178,8 +184,7 @@ xrep_cow_mark_missing_staging( goto next; error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - xc->next_bno), + xfs_gbno_to_fsb(cur->bc_group, xc->next_bno), rrec.rc_startblock - xc->next_bno); if (error) return error; @@ -200,7 +205,6 @@ xrep_cow_mark_missing_staging_rmap( void *priv) { struct xrep_cow *xc = priv; - xfs_fsblock_t fsbno; xfs_agblock_t rec_bno; xfs_extlen_t rec_len; unsigned int adj; @@ -222,8 +226,8 @@ xrep_cow_mark_missing_staging_rmap( rec_len -= adj; } - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); - return xrep_cow_mark_file_range(xc, fsbno, rec_len); + return xrep_cow_mark_file_range(xc, + xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len); } /* @@ -275,8 +279,7 @@ xrep_cow_find_bad( if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, - xc->next_bno), + xfs_agbno_to_fsb(pag, xc->next_bno), xc->irec_startbno + xc->irec.br_blockcount - xc->next_bno); if (error) @@ -312,6 +315,92 @@ out_pag: } /* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. 
+ */ +STATIC int +xrep_cow_find_bad_rt( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_scrub *sc = xc->sc; + struct xfs_rtgroup *rtg; + int error = 0; + + xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock); + + rtg = xfs_rtgroup_get(sc->mp, + xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock)); + if (!rtg) + return -EFSCORRUPTED; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); + if (error) + goto out_rtg; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sr; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sr; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + xfs_rgbno_to_rtb(rtg, xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sr; + } + + /* Mark any area that has an rmap that isn't a COW staging extent. */ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sr; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone + * turned on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + goto out_sr; + } + +out_sr: + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); +out_rtg: + xfs_rtgroup_put(rtg); + return error; +} + +/* + * Allocate a replacement rt CoW staging extent of up to the given number of + * blocks, and fill out the mapping. 
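+ * + * (Caller sketch, taken from the xrep_cow_replace_range hunk later in this patch; realtime files route here while data-device files keep the original helper: + * + * if (XFS_IS_REALTIME_INODE(sc->ip)) + * error = xrep_cow_alloc_rt(sc, alloc_len, &repl); + * else + * error = xrep_cow_alloc(sc, alloc_len, &repl); + * + * and both paths fill out the same struct xrep_cow_extent.)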
+ */ +STATIC int +xrep_cow_alloc_rt( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen); + int error; + + error = xfs_trans_reserve_more(sc->tp, 0, maxrtx); + if (error) + return error; + + error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false, + false, &repl->fsbno, &repl->len); + if (error) + return error; + + xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len); + return 0; +} + +/* * Look up the current CoW fork mapping so that we only allocate enough to * replace a single mapping. If we don't find a mapping that covers the start * of the file range, or we find a delalloc or written extent, something is @@ -468,7 +583,10 @@ xrep_cow_replace_range( */ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, nextoff - startoff); - error = xrep_cow_alloc(sc, alloc_len, &repl); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_alloc_rt(sc, alloc_len, &repl); + else + error = xrep_cow_alloc(sc, alloc_len, &repl); if (error) return error; @@ -484,8 +602,12 @@ xrep_cow_replace_range( return error; /* Note the old CoW staging extents; we'll reap them all later. */ - error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, - repl.len); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks, + got.br_startblock, repl.len); + else + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, + got.br_startblock, repl.len); if (error) return error; @@ -541,8 +663,16 @@ xrep_bmap_cow( if (!ifp) return 0; - /* realtime files aren't supported yet */ - if (XFS_IS_REALTIME_INODE(sc->ip)) + /* + * Realtime files with large extent sizes are not supported because + * we could encounter a CoW mapping that has been partially written + * out *and* requires replacement, and there's no solution to that. + */ + if (xfs_inode_has_bigrtalloc(sc->ip)) + return -EOPNOTSUPP; + + /* Metadata inodes aren't supposed to have data on the rt volume. */ + if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip)) return -EOPNOTSUPP; /* @@ -563,7 +693,10 @@ xrep_bmap_cow( xc->sc = sc; xoff_bitmap_init(&xc->bad_fileoffs); - xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_init(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); for_each_xfs_iext(ifp, &icur, &xc->irec) { if (xchk_should_terminate(sc, &error)) @@ -586,7 +719,10 @@ xrep_bmap_cow( if (xfs_bmap_is_written_extent(&xc->irec)) continue; - error = xrep_cow_find_bad(xc); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_cow_find_bad_rt(xc); + else + error = xrep_cow_find_bad(xc); if (error) goto out_bitmap; } @@ -601,13 +737,20 @@ xrep_bmap_cow( * by the refcount btree, not the inode, so it is correct to treat them * like inode metadata. 
*/ - error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, - &XFS_RMAP_OINFO_COW); + if (XFS_IS_REALTIME_INODE(sc->ip)) + error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks, + &XFS_RMAP_OINFO_COW); + else + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); if (error) goto out_bitmap; out_bitmap: - xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + if (XFS_IS_REALTIME_INODE(sc->ip)) + xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks); + else + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); kfree(xc); return error; } diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h new file mode 100644 index 000000000000..0c6e3aad4395 --- /dev/null +++ b/fs/xfs/scrub/dab_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DAB_BITMAP_H__ +#define __XFS_SCRUB_DAB_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_dablk_t */ + +struct xdab_bitmap { + struct xbitmap32 dabitmap; +}; + +static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->dabitmap); +} + +static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->dabitmap); +} + +static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->dabitmap, dabno, len); +} + +static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->dabitmap, dabno, len); +} + +#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 82b150d3b8b7..056de4819f86 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -78,6 +78,22 @@ xchk_da_set_corrupt( __return_address); } +/* Flag a da btree node in need of optimization. */ +void +xchk_da_set_preen( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_scrub *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. */ static struct xfs_da_node_entry * xchk_da_btree_node_entry( struct xchk_da_btree *ds, @@ -320,6 +336,7 @@ xchk_da_btree_block( struct xfs_da3_blkinfo *hdr3; struct xfs_da_args *dargs = &ds->dargs; struct xfs_inode *ip = ds->dargs.dp; + xfs_failaddr_t fa; xfs_ino_t owner; int *pmaxrecs; struct xfs_da3_icnode_hdr nodehdr; @@ -442,6 +459,12 @@ xchk_da_btree_block( goto out_freebp; } + fa = xfs_da3_header_check(blk->bp, dargs->owner); + if (fa) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + /* * If we've been handed a block that is below the dabtree root, does * its hashval match what the parent block expected to see? @@ -494,6 +517,7 @@ xchk_da_btree( ds->dargs.whichfork = whichfork; ds->dargs.trans = sc->tp; ds->dargs.op_flags = XFS_DA_OP_OKNOENT; + ds->dargs.owner = sc->ip->i_ino; ds->state = xfs_da_state_alloc(&ds->dargs); ds->sc = sc; ds->private = private; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 4f8c2138a1ec..de291e3b77dd 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -35,6 +35,7 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); /* Check for da btree corruption. 
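 * (For metadata that is intact but merely laid out suboptimally, a checker calls the new helper below instead, e.g. xchk_da_set_preen(ds, level), which sets XFS_SCRUB_OFLAG_PREEN rather than flagging corruption.)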
*/ void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level); +void xchk_da_set_preen(struct xchk_da_btree *ds, int level); int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp); int xchk_da_btree(struct xfs_scrub *sc, int whichfork, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 076a310b8eb0..c877bde71e62 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -16,22 +16,70 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_health.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" #include "scrub/health.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" /* Set us up to scrub directories. */ int xchk_setup_directory( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_directory(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } /* Directories */ +/* Deferred directory entry that we saved for later. */ +struct xchk_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Child inode number. */ + xfs_ino_t ino; + + /* Length of the dirent name. */ + uint8_t namelen; +}; + +struct xchk_dir { + struct xfs_scrub *sc; + + /* information for parent pointer validation. */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Fixed-size array of xchk_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing dirent names. */ + struct xfblob *dir_names; + + /* If we've cycled the ILOCK, we must revalidate deferred dirents. */ + bool need_revalidate; + + /* Name buffer for dirent revalidation. */ + struct xfs_name xname; + uint8_t namebuf[MAXNAMELEN]; +}; + /* Scrub a directory entry. */ /* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ @@ -52,6 +100,116 @@ xchk_dir_check_ftype( if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * Metadata and regular inodes cannot cross trees. This property + * cannot change without a full inode free and realloc cycle, so it's + * safe to check this without holding locks. + */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); +} + +/* + * Try to lock a child file for checking parent pointers. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_dir_lock_child( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the backwards link (parent pointer) associated with this dirent. 
*/ +STATIC int +xchk_dir_parent_pointer( + struct xchk_dir *sd, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + int error; + + xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip); + error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec, + &sd->pptr_args); + if (error == -ENOATTR) + xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0); + + return 0; +} + +/* Look for a parent pointer matching this dirent, if the child isn't busy. */ +STATIC int +xchk_dir_check_pptr_fast( + struct xchk_dir *sd, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + unsigned int lockmode; + int error; + + /* dot and dotdot entries do not have parent pointers */ + if (xfs_dir2_samename(name, &xfs_name_dot) || + xfs_dir2_samename(name, &xfs_name_dotdot)) + return 0; + + /* No self-referential non-dot or dotdot dirents. */ + if (ip == sc->ip) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return -ECANCELED; + } + + /* Try to lock the inode. */ + lockmode = xchk_dir_lock_child(sc, ip); + if (!lockmode) { + struct xchk_dirent save_de = { + .namelen = name->len, + .ino = ip->i_ino, + }; + + /* Couldn't lock the inode, so save the dirent for later. */ + trace_xchk_dir_defer(sc->ip, name, ip->i_ino); + + error = xfblob_storename(sd->dir_names, &save_de.name_cookie, + name); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + error = xfarray_append(sd->dir_entries, &save_de); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + return 0; + } + + error = xchk_dir_parent_pointer(sd, name, ip); + xfs_iunlock(ip, lockmode); + return error; } /* @@ -71,6 +229,7 @@ xchk_dir_actor( { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; + struct xchk_dir *sd = priv; xfs_ino_t lookup_ino; xfs_dablk_t offset; int error = 0; @@ -102,7 +261,7 @@ xchk_dir_actor( * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } @@ -137,6 +296,14 @@ xchk_dir_actor( goto out; xchk_dir_check_ftype(sc, offset, ip, name->type); + + if (xfs_has_parent(mp)) { + error = xchk_dir_check_pptr_fast(sd, dapos, name, ip); + if (error) + goto out_rele; + } + +out_rele: xchk_irele(sc, ip); out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -196,8 +363,8 @@ xchk_dir_rec( xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, - XFS_DABUF_MAP_HOLE_OK, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner, + rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -315,10 +482,11 @@ xchk_directory_data_bestfree( /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); - error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp); } else { /* dir data format */ - error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, + 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -470,7 +638,7 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. 
*/ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -531,10 +699,9 @@ xchk_directory_leaf1_bestfree( /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { best = be16_to_cpu(*bestp); - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, xfs_dir2_db_to_da(args->geo, i), - XFS_DABUF_MAP_HOLE_OK, - &dbp); + XFS_DABUF_MAP_HOLE_OK, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; @@ -577,7 +744,7 @@ xchk_directory_free_bestfree( int error; /* Read the free space block */ - error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -597,7 +764,7 @@ xchk_directory_free_bestfree( stale++; continue; } - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, (freehdr.firstdb + i) * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, @@ -621,10 +788,11 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args = { - .dp = sc ->ip, + .dp = sc->ip, .whichfork = XFS_DATA_FORK, .geo = sc->mp->m_dir_geo, .trans = sc->tp, + .owner = sc->ip->i_ino, }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; @@ -648,7 +816,8 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - error = xfs_dir2_isblock(&args, &is_block); + if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK) + is_block = true; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -752,11 +921,148 @@ out: return error; } +/* + * Revalidate a dirent that we collected in the past but couldn't check because + * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it + * has gone away on us, or a negative errno. + */ +STATIC int +xchk_dir_revalidate_dirent( + struct xchk_dir *sd, + const struct xfs_name *xname, + xfs_ino_t ino) +{ + struct xfs_scrub *sc = sd->sc; + xfs_ino_t child_ino; + int error; + + /* + * Look up the directory entry. If we get -ENOENT, the directory entry + * went away and there's nothing to revalidate. Return any other + * error. + */ + error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino); + if (error) + return error; + + /* The inode number changed, nothing to revalidate. */ + if (ino != child_ino) + return -ENOENT; + + return 0; +} + +/* + * Check a directory entry's parent pointers the slow way, which means we cycle + * locks a bunch and put up with revalidation until we get it done. + */ +STATIC int +xchk_dir_slow_dirent( + struct xchk_dir *sd, + struct xchk_dirent *dirent, + const struct xfs_name *xname) +{ + struct xfs_scrub *sc = sd->sc; + struct xfs_inode *ip; + unsigned int lockmode; + int error; + + /* Check that the deferred dirent still exists. 
*/ + if (sd->need_revalidate) { + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + } + + error = xchk_iget(sc, dirent->ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged child, we can + * proceed with the validation. + */ + lockmode = xchk_dir_lock_child(sc, ip); + if (lockmode) { + trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino); + goto check_pptr; + } + + /* + * We couldn't lock the child file. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + sd->need_revalidate = true; + + trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode); + if (error) + goto out_rele; + + /* Revalidate, since we just cycled the locks. */ + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + +check_pptr: + error = xchk_dir_parent_pointer(sd, xname, ip); +out_unlock: + xfs_iunlock(ip, lockmode); +out_rele: + xchk_irele(sc, ip); + return error; +} + +/* Check all the dirents that we deferred the first time around. */ +STATIC int +xchk_dir_finish_slow_dirents( + struct xchk_dir *sd) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(sd->dir_entries, array_cur) { + struct xchk_dirent dirent; + + if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(sd->dir_entries, array_cur, &dirent); + if (error) + return error; + + error = xfblob_loadname(sd->dir_names, dirent.name_cookie, + &sd->xname, dirent.namelen); + if (error) + return error; + + error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname); + if (error) + return error; + } + + return 0; +} + /* Scrub a whole directory. */ int xchk_directory( struct xfs_scrub *sc) { + struct xchk_dir *sd; int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) @@ -789,9 +1095,60 @@ xchk_directory( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS); + if (!sd) + return -ENOMEM; + sd->sc = sc; + sd->xname.name = sd->namebuf; + + if (xfs_has_parent(sc->mp)) { + char *descr; + + /* + * Set up some staging memory for dirents that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirent), + &sd->dir_entries); + kfree(descr); + if (error) + goto out_sd; + + descr = xchk_xfile_ino_descr(sc, "slow directory entry names"); + error = xfblob_create(descr, &sd->dir_names); + kfree(descr); + if (error) + goto out_entries; + } + /* Look up every name in this directory by hash. 
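 * (Aside on the staging machinery set up above: the variable-length name is stored in the xfblob first so that its cookie can be embedded in the fixed-size xfarray record, i.e. xfblob_storename(sd->dir_names, &save_de.name_cookie, name) followed by xfarray_append(sd->dir_entries, &save_de); deferred records are read back later with xfarray_load() and xfblob_loadname() in xchk_dir_finish_slow_dirents.)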
*/ - error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error && error != -ECANCELED) + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd); + if (error == -ECANCELED) + error = 0; + if (error) + goto out_names; + + if (xfs_has_parent(sc->mp)) { + error = xchk_dir_finish_slow_dirents(sd); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + } + +out_names: + if (sd->dir_names) + xfblob_destroy(sd->dir_names); +out_entries: + if (sd->dir_entries) + xfarray_destroy(sd->dir_entries); +out_sd: + kvfree(sd); + if (error) return error; /* If the dir is clean, it is clearly not zapped. */ diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c new file mode 100644 index 000000000000..249313882108 --- /dev/null +++ b/fs/xfs/scrub/dir_repair.c @@ -0,0 +1,1968 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_ag.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" +#include "scrub/reap.h" +#include "scrub/findparent.h" +#include "scrub/orphanage.h" +#include "scrub/listxattr.h" + +/* + * Directory Repair + * ================ + * + * We repair directories by reading the directory data blocks looking for + * directory entries that look salvageable (name passes verifiers, entry points + * to a valid allocated inode, etc). Each entry worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * directory to constrain memory use. Batching the construction of the + * temporary directory in this fashion reduces lock cycling of the directory + * being repaired and the temporary directory, and will later become important + * for parent pointer scanning. + * + * If parent pointers are enabled on this filesystem, we instead reconstruct + * the directory by visiting each parent pointer of each file in the filesystem + * and translating the relevant parent pointer records into dirents. In this + * case, it is advantageous to stash all directory entries created from parent + * pointers for a single child file before replaying them into the temporary + * directory. To save memory, the live filesystem scan reuses the findparent + * fields. Directory repair chooses either parent pointer scanning or + * directory entry salvaging, but not both. + * + * Directory entries added to the temporary directory do not elevate the link + * counts of the inodes found. 
When salvaging completes, the remaining stashed + * entries are replayed to the temporary directory. An atomic mapping exchange + * is used to commit the new directory blocks to the directory being repaired. + * This will disrupt readdir cursors. + * + * Locking Issues + * -------------- + * + * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on + * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects + * b's dotdot update. This is in contrast to every other dotdot update (link, + * remove, mkdir). If the repair code drops the ILOCK, it must either + * revalidate the dotdot entry or use dirent hooks to capture updates from + * other threads. + */ + +/* Create a dirent in the tempdir. */ +#define XREP_DIRENT_ADD (1) + +/* Remove a dirent from the tempdir. */ +#define XREP_DIRENT_REMOVE (2) + +/* Directory entry to be restored in the new directory. */ +struct xrep_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Target inode number. */ + xfs_ino_t ino; + + /* Length of the dirent name. */ + uint8_t namelen; + + /* File type of the dirent. */ + uint8_t ftype; + + /* XREP_DIRENT_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names + * before we write them to the temp dir. + */ +#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_dir { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing directory entry names. */ + struct xfblob *dir_names; + + /* Information for exchanging data forks at the end. */ + struct xrep_tempexch tx; + + /* Preallocated args struct for performing dir operations */ + struct xfs_da_args args; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. For directory salvaging when + * parent pointers are not enabled, we use the findparent_* functions + * on this object and access only the parent_ino field directly. + * + * When parent pointers are enabled, however, the pptr scanner uses the + * iscan, hooks, lock, and parent_ino fields of this object directly. + * @pscan.lock coordinates access to dir_entries, dir_names, + * parent_ino, subdirs, dirents, and args. This reduces the memory + * requirements of this structure. + */ + struct xrep_parent_scan_info pscan; + + /* + * Context information for attaching this directory to the lost+found + * if this directory does not have a parent. + */ + struct xrep_adoption adoption; + + /* How many subdirectories did we find? */ + uint64_t subdirs; + + /* How many dirents did we find? */ + unsigned int dirents; + + /* Should we move this directory to the orphanage? */ + bool needs_adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; +}; + +/* Tear down all the incore stuff we created. */ +static void +xrep_dir_teardown( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + + xrep_findparent_scan_teardown(&rd->pscan); + xfblob_destroy(rd->dir_names); + xfarray_destroy(rd->dir_entries); +} + +/* Set up for a directory repair. 
*/ +int +xrep_setup_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + error = xrep_orphanage_try_create(sc); + if (error) + return error; + + error = xrep_tempfile_create(sc, S_IFDIR); + if (error) + return error; + + rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); + if (!rd) + return -ENOMEM; + rd->sc = sc; + rd->xname.name = rd->namebuf; + sc->buf = rd; + + return 0; +} + +/* + * Look up the dotdot entry and confirm that it's really the parent. + * Returns NULLFSINO if we don't know what to do. + */ +static inline xfs_ino_t +xrep_dir_lookup_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; + int error; + + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); + if (error) + return NULLFSINO; + if (!xfs_verify_dir_ino(sc->mp, ino)) + return NULLFSINO; + + error = xrep_findparent_confirm(sc, &ino); + if (error) + return NULLFSINO; + + return ino; +} + +/* + * Look up '..' in the dentry cache and confirm that it's really the parent. + * Returns NULLFSINO if the dcache misses or if the hit is implausible. + */ +static inline xfs_ino_t +xrep_dir_dcache_parent( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t parent_ino; + int error; + + parent_ino = xrep_findparent_from_dcache(sc); + if (parent_ino == NULLFSINO) + return parent_ino; + + error = xrep_findparent_confirm(sc, &parent_ino); + if (error) + return NULLFSINO; + + return parent_ino; +} + +/* Try to find the parent of the directory being repaired. */ +STATIC int +xrep_dir_find_parent( + struct xrep_dir *rd) +{ + xfs_ino_t ino; + + ino = xrep_findparent_self_reference(rd->sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_dcache_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + ino = xrep_dir_lookup_parent(rd); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rd->pscan, ino); + return 0; + } + + /* + * A full filesystem scan is the last resort. On a busy filesystem, + * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means + * that we don't know who the parent is, so we should return to + * userspace. + */ + return xrep_findparent_scan(&rd->pscan); +} + +/* + * Decide if we want to salvage this entry. We don't bother with oversized + * names or the dot entry. + */ +STATIC int +xrep_dir_want_salvage( + struct xrep_dir *rd, + const char *name, + int namelen, + xfs_ino_t ino) +{ + struct xfs_mount *mp = rd->sc->mp; + + /* No pointers to ourselves or to garbage. */ + if (ino == rd->sc->ip->i_ino) + return false; + if (!xfs_verify_dir_ino(mp, ino)) + return false; + + /* No weird looking names or dot entries. */ + if (namelen >= MAXNAMELEN || namelen <= 0) + return false; + if (namelen == 1 && name[0] == '.') + return false; + if (!xfs_dir2_namecheck(name, namelen)) + return false; + + return true; +} + +/* + * Remember that we want to create a dirent in the tempdir. These stashed + * actions will be replayed later. 
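+ * + * (Replay sketch, mirroring the iteration pattern of xchk_dir_finish_slow_dirents earlier in this patch: + * + * foreach_xfarray_idx(rd->dir_entries, array_cur) { + * error = xfarray_load(rd->dir_entries, array_cur, &dirent); + * ... + * error = xfblob_loadname(rd->dir_names, dirent.name_cookie, &rd->xname, dirent.namelen); + * ... + * } + * + * dispatching on dirent.action, XREP_DIRENT_ADD vs XREP_DIRENT_REMOVE, to the replay helpers below.)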
+ */ +STATIC int +xrep_dir_stash_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_ADD, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* + * Remember that we want to remove a dirent from the tempdir. These stashed + * actions will be replayed later. + */ +STATIC int +xrep_dir_stash_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_REMOVE, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* Allocate an in-core record to hold entries while we rebuild the dir data. */ +STATIC int +xrep_dir_salvage_entry( + struct xrep_dir *rd, + unsigned char *name, + unsigned int namelen, + xfs_ino_t ino) +{ + struct xfs_name xname = { + .name = name, + }; + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this entry. + */ + while (i < namelen && name[i] != 0 && name[i] != '/') + i++; + if (i == 0) + return 0; + xname.len = i; + + /* Ignore '..' entries; we already picked the new parent. */ + if (xname.len == 2 && name[0] == '.' && name[1] == '.') { + trace_xrep_dir_salvaged_parent(sc->ip, ino); + return 0; + } + + trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); + + /* + * Compute the ftype or dump the entry if we can't. We don't lock the + * inode because inodes can't change type while we have a reference. + */ + error = xchk_iget(sc, ino, &ip); + if (error) + return 0; + + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) { + xchk_irele(sc, ip); + return 0; + } + + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + xchk_irele(sc, ip); + + return xrep_dir_stash_createname(rd, &xname, ino); +} + +/* Record a shortform directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_sf_entry( + struct xrep_dir *rd, + struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep) +{ + xfs_ino_t ino; + + ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); + if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); +} + +/* Record a regular directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_data_entry( + struct xrep_dir *rd, + struct xfs_dir2_data_entry *dep) +{ + xfs_ino_t ino; + + ino = be64_to_cpu(dep->inumber); + if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); +} + +/* Try to recover block/data format directory entries. 
+
+/* Try to recover block/data format directory entries. */
+STATIC int
+xrep_dir_recover_data(
+	struct xrep_dir		*rd,
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
+	unsigned int		offset;
+	unsigned int		end;
+	int			error = 0;
+
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	offset = geo->data_entry_offset;
+	end = min_t(unsigned int, BBTOB(bp->b_length),
+			xfs_dir3_data_end_offset(geo, bp->b_addr));
+
+	while (offset < end) {
+		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
+		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
+
+		if (xchk_should_terminate(rd->sc, &error))
+			return error;
+
+		/* Skip unused entries. */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			offset += be16_to_cpu(dup->length);
+			continue;
+		}
+
+		/* Don't walk off the end of the block. */
+		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
+		if (offset > end)
+			break;
+
+		/* Ok, let's save this entry. */
+		error = xrep_dir_salvage_data_entry(rd, dep);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/* Try to recover shortform directory entries. */
+STATIC int
+xrep_dir_recover_sf(
+	struct xrep_dir		*rd)
+{
+	struct xfs_dir2_sf_hdr	*hdr;
+	struct xfs_dir2_sf_entry *sfep;
+	struct xfs_dir2_sf_entry *next;
+	struct xfs_ifork	*ifp;
+	xfs_ino_t		ino;
+	unsigned char		*end;
+	int			error = 0;
+
+	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
+	hdr = ifp->if_data;
+	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+
+	ino = xfs_dir2_sf_get_parent_ino(hdr);
+	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
+
+	sfep = xfs_dir2_sf_firstentry(hdr);
+	while ((unsigned char *)sfep < end) {
+		if (xchk_should_terminate(rd->sc, &error))
+			return error;
+
+		next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
+		if ((unsigned char *)next > end)
+			break;
+
+		/* Ok, let's save this entry. */
+		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
+		if (error)
+			return error;
+
+		sfep = next;
+	}
+
+	return 0;
+}
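
Both recovery loops above walk a packed sequence of variable-size records and must take care not to run off the end of the buffer. A minimal sketch of that walk shape, with invented record types and SKETCH_FREE_TAG standing in for the on-disk dir2 structures and XFS_DIR2_DATA_FREE_TAG:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_FREE_TAG	0xffff	/* stand-in for XFS_DIR2_DATA_FREE_TAG */

/* Both record flavors begin with a 16-bit discriminator, as in dir2 data. */
struct rec_unused { uint16_t freetag; uint16_t length; };
struct rec_used   { uint16_t namelen; /* name bytes follow */ };

/* Walk a packed buffer of used/unused records, visiting the used ones. */
static void walk_records(const unsigned char *buf, size_t len,
			 void (*visit)(const struct rec_used *))
{
	size_t offset = 0;

	while (offset + sizeof(struct rec_unused) <= len) {
		const struct rec_unused *dup = (const void *)(buf + offset);
		const struct rec_used *dep = (const void *)(buf + offset);

		if (dup->freetag == SKETCH_FREE_TAG) {
			if (dup->length == 0)
				break;		/* corrupt: avoid spinning */
			offset += dup->length;	/* skip unused space */
			continue;
		}

		/* Don't walk off the end of the buffer. */
		offset += sizeof(*dep) + dep->namelen;
		if (offset > len)
			break;
		visit(dep);
	}
}
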
+
+/*
+ * Try to figure out the format of this directory from the data fork mappings
+ * and the directory size.  If we can be reasonably sure of the format, we
+ * can be more aggressive in salvaging directory entries.  On return,
+ * @magic_guess will be set to DIR3_BLOCK_MAGIC if we think this is a "block
+ * format" directory, DIR3_DATA_MAGIC if we think this is a "data format"
+ * directory, and 0 if we can't tell.
+ */
+STATIC void
+xrep_dir_guess_format(
+	struct xrep_dir		*rd,
+	__be32			*magic_guess)
+{
+	struct xfs_inode	*dp = rd->sc->ip;
+	struct xfs_mount	*mp = rd->sc->mp;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	xfs_fileoff_t		last;
+	int			error;
+
+	ASSERT(xfs_has_crc(mp));
+
+	*magic_guess = 0;
+
+	/*
+	 * If there's a single directory block and the directory size is
+	 * exactly one block, this has to be a single block format directory.
+	 */
+	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
+	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
+	    dp->i_disk_size == geo->blksize) {
+		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		return;
+	}
+
+	/*
+	 * If the last extent before the leaf offset matches the directory
+	 * size and the directory size is larger than 1 block, this is a
+	 * data format directory.
+	 */
+	last = geo->leafblk;
+	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
+	if (!error &&
+	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
+	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
+		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+		return;
+	}
+}
+
+/* Recover directory entries from a specific directory block. */
+STATIC int
+xrep_dir_recover_dirblock(
+	struct xrep_dir		*rd,
+	__be32			magic_guess,
+	xfs_dablk_t		dabno)
+{
+	struct xfs_dir2_data_hdr *hdr;
+	struct xfs_buf		*bp;
+	__be32			oldmagic;
+	int			error;
+
+	/*
+	 * Try to read the buffer.  We invalidate these buffers in the next
+	 * step, so we don't bother to set a buffer type or ops.
+	 */
+	error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
+			XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
+	if (error || !bp)
+		return error;
+
+	hdr = bp->b_addr;
+	oldmagic = hdr->magic;
+
+	trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
+			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
+
+	/*
+	 * If we're sure of the block's format, proceed with the salvage
+	 * operation using the specified magic number.
+	 */
+	if (magic_guess) {
+		hdr->magic = magic_guess;
+		goto recover;
+	}
+
+	/*
+	 * If we couldn't guess what type of directory this is, then we will
+	 * only salvage entries from directory blocks that match the magic
+	 * number and pass verifiers.
+	 */
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+		if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
+			goto out;
+		if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
+			goto out;
+		break;
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+		if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
+			goto out;
+		if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
+			goto out;
+		break;
+	default:
+		goto out;
+	}
+
+recover:
+	error = xrep_dir_recover_data(rd, bp);
+
+out:
+	hdr->magic = oldmagic;
+	xfs_trans_brelse(rd->sc->tp, bp);
+	return error;
+}
+
+static inline void
+xrep_dir_init_args(
+	struct xrep_dir		*rd,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name)
+{
+	memset(&rd->args, 0, sizeof(struct xfs_da_args));
+	rd->args.geo = rd->sc->mp->m_dir_geo;
+	rd->args.whichfork = XFS_DATA_FORK;
+	rd->args.owner = rd->sc->ip->i_ino;
+	rd->args.trans = rd->sc->tp;
+	rd->args.dp = dp;
+	if (!name)
+		return;
+	rd->args.name = name->name;
+	rd->args.namelen = name->len;
+	rd->args.filetype = name->type;
+	rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
+}
+
+/* Replay a stashed createname into the temporary directory. */
+STATIC int
+xrep_dir_replay_createname(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*name,
+	xfs_ino_t		inum,
+	xfs_extlen_t		total)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_inode	*dp = rd->sc->tempip;
+	int			error;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+	error = xfs_dir_ino_validate(sc->mp, inum);
+	if (error)
+		return error;
+
+	trace_xrep_dir_replay_createname(dp, name, inum);
+
+	xrep_dir_init_args(rd, dp, name);
+	rd->args.inumber = inum;
+	rd->args.total = total;
+	rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+	return xfs_dir_createname_args(&rd->args);
+}
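
The format guess above reduces to two size comparisons once the bmap lookups are done. A hedged restatement as a pure function over byte counts (guess_format() and enum dir_format are illustrative names only):

#include <stdint.h>

enum dir_format { DIR_FMT_UNKNOWN, DIR_FMT_BLOCK, DIR_FMT_DATA };

/*
 * Guess the directory format from the byte offset just past the last data
 * mapping and from the directory's size, mirroring the two tests above.
 */
static enum dir_format guess_format(uint64_t last_mapped_byte,
				    uint64_t disk_size, uint32_t blksize)
{
	/* Exactly one block mapped and sized: single-block format. */
	if (last_mapped_byte == blksize && disk_size == blksize)
		return DIR_FMT_BLOCK;
	/* More than one block, and size matches the mappings: data format. */
	if (last_mapped_byte > blksize && last_mapped_byte == disk_size)
		return DIR_FMT_DATA;
	return DIR_FMT_UNKNOWN;
}
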
+
+/* Replay a stashed removename onto the temporary directory. */
+STATIC int
+xrep_dir_replay_removename(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*name,
+	xfs_extlen_t		total)
+{
+	struct xfs_inode	*dp = rd->args.dp;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+	xrep_dir_init_args(rd, dp, name);
+	rd->args.op_flags = 0;
+	rd->args.total = total;
+
+	trace_xrep_dir_replay_removename(dp, name, 0);
+	return xfs_dir_removename_args(&rd->args);
+}
+
+/*
+ * Add this stashed incore directory entry to the temporary directory.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ */
+STATIC int
+xrep_dir_replay_update(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*xname,
+	const struct xrep_dirent *dirent)
+{
+	struct xfs_mount	*mp = rd->sc->mp;
+#ifdef DEBUG
+	xfs_ino_t		ino;
+#endif
+	uint			resblks;
+	int			error;
+
+	resblks = xfs_link_space_res(mp, xname->len);
+	error = xchk_trans_alloc(rd->sc, resblks);
+	if (error)
+		return error;
+
+	/* Lock the temporary directory and join it to the transaction */
+	xrep_tempfile_ilock(rd->sc);
+	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+	switch (dirent->action) {
+	case XREP_DIRENT_ADD:
+		/*
+		 * Create a replacement dirent in the temporary directory.
+		 * Note that _createname doesn't check for existing entries.
+		 * There shouldn't be any in the temporary dir, but we'll
+		 * verify this in debug mode.
+		 */
+#ifdef DEBUG
+		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+		if (error != -ENOENT) {
+			ASSERT(error != -ENOENT);
+			goto out_cancel;
+		}
+#endif
+
+		error = xrep_dir_replay_createname(rd, xname, dirent->ino,
+				resblks);
+		if (error)
+			goto out_cancel;
+
+		if (xname->type == XFS_DIR3_FT_DIR)
+			rd->subdirs++;
+		rd->dirents++;
+		break;
+	case XREP_DIRENT_REMOVE:
+		/*
+		 * Remove a dirent from the temporary directory.  Note that
+		 * _removename doesn't check the inode target of the existing
+		 * entry.  There should be a perfect match in the temporary
+		 * dir, but we'll verify this in debug mode.
+		 */
+#ifdef DEBUG
+		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+		if (error) {
+			ASSERT(error != 0);
+			goto out_cancel;
+		}
+		if (ino != dirent->ino) {
+			ASSERT(ino == dirent->ino);
+			error = -EIO;
+			goto out_cancel;
+		}
+#endif
+
+		error = xrep_dir_replay_removename(rd, xname, resblks);
+		if (error)
+			goto out_cancel;
+
+		if (xname->type == XFS_DIR3_FT_DIR)
+			rd->subdirs--;
+		rd->dirents--;
+		break;
+	default:
+		ASSERT(0);
+		error = -EIO;
+		goto out_cancel;
+	}
+
+	/* Commit and unlock. */
+	error = xrep_trans_commit(rd->sc);
+	if (error)
+		return error;
+
+	xrep_tempfile_iunlock(rd->sc);
+	return 0;
+out_cancel:
+	xchk_trans_cancel(rd->sc);
+	xrep_tempfile_iunlock(rd->sc);
+	return error;
+}
+
+/*
+ * Flush stashed incore dirent updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the directory rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+	struct xrep_dir		*rd)
+{
+	xfarray_idx_t		array_cur;
+	int			error;
+
+	/* Add all the salvaged dirents to the temporary directory. */
+	mutex_lock(&rd->pscan.lock);
+	foreach_xfarray_idx(rd->dir_entries, array_cur) {
+		struct xrep_dirent	dirent;
+
+		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
+		if (error)
+			goto out_unlock;
+
+		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
+				&rd->xname, dirent.namelen);
+		if (error)
+			goto out_unlock;
+		rd->xname.type = dirent.ftype;
+		mutex_unlock(&rd->pscan.lock);
+
+		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
+		if (error)
+			return error;
+		mutex_lock(&rd->pscan.lock);
+	}
+
+	/* Empty out both arrays now that we've added the entries. */
+	xfarray_truncate(rd->dir_entries);
+	xfblob_truncate(rd->dir_names);
+	mutex_unlock(&rd->pscan.lock);
+	return 0;
+out_unlock:
+	mutex_unlock(&rd->pscan.lock);
+	return error;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir.  This
+ * is done to reduce the memory requirements of the directory rebuild, since
+ * directories can contain up to 32GB of directory data.
+ */
+STATIC int
+xrep_dir_flush_stashed(
+	struct xrep_dir		*rd)
+{
+	int			error;
+
+	/*
+	 * Entering this function, the scrub context has a reference to the
+	 * inode being repaired, the temporary file, and a scrub transaction
+	 * that we use during dirent salvaging to avoid livelocking if there
+	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
+	 * the inode being repaired and the temporary file, though they are
+	 * not ijoined to the scrub transaction.
+	 *
+	 * To constrain kernel memory use, we occasionally write salvaged
+	 * dirents from the xfarray and xfblob structures into the temporary
+	 * directory in preparation for exchanging the directory structures at
+	 * the end.  Updating the temporary file requires a transaction, so we
+	 * commit the scrub transaction and drop the two ILOCKs so that
+	 * we can allocate whatever transaction we want.
+	 *
+	 * We still hold IOLOCK_EXCL on the inode being repaired, which
+	 * prevents anyone from accessing the damaged directory data while we
+	 * repair it.
+	 */
+	error = xrep_trans_commit(rd->sc);
+	if (error)
+		return error;
+	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
+
+	/*
+	 * Take the IOLOCK of the temporary file while we modify dirents.
+	 * This isn't strictly required because the temporary file is never
+	 * revealed to userspace, but we follow the same locking rules.  We
+	 * still hold sc->ip's IOLOCK.
+	 */
+	error = xrep_tempfile_iolock_polled(rd->sc);
+	if (error)
+		return error;
+
+	/* Write to the tempdir all the updates that we've stashed. */
+	error = xrep_dir_replay_updates(rd);
+	xrep_tempfile_iounlock(rd->sc);
+	if (error)
+		return error;
+
+	/*
+	 * Recreate the salvage transaction and relock the dir we're
+	 * salvaging.
+	 */
+	error = xchk_trans_alloc(rd->sc, 0);
+	if (error)
+		return error;
+	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/* Decide if we've stashed too much dirent data in memory. */
+static inline bool
+xrep_dir_want_flush_stashed(
+	struct xrep_dir		*rd)
+{
+	unsigned long long	bytes;
+
+	bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
+	return bytes > XREP_DIR_MAX_STASH_BYTES;
+}
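
The flush trigger above is a plain byte threshold across both staging structures. The same decision as a sketch, assuming a hypothetical cap in place of XREP_DIR_MAX_STASH_BYTES:

#include <stdbool.h>
#include <stddef.h>

#define SKETCH_MAX_STASH_BYTES	(512UL << 10)	/* hypothetical 512KiB cap */

/* Flush once the record bytes plus the name bytes cross the cap. */
static bool want_flush(size_t entry_bytes, size_t name_bytes)
{
	return entry_bytes + name_bytes > SKETCH_MAX_STASH_BYTES;
}

Any fixed cap works here; the tradeoff is peak memory use versus how often the repair pays for a transaction allocation and lock cycle to replay the stash.
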
+
+/* Extract as many directory entries as we can. */
+STATIC int
+xrep_dir_recover(
+	struct xrep_dir		*rd)
+{
+	struct xfs_bmbt_irec	got;
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
+	xfs_fileoff_t		offset;
+	xfs_dablk_t		dabno;
+	__be32			magic_guess;
+	int			nmap;
+	int			error;
+
+	xrep_dir_guess_format(rd, &magic_guess);
+
+	/* Iterate each directory data block in the data fork. */
+	for (offset = 0;
+	     offset < geo->leafblk;
+	     offset = got.br_startoff + got.br_blockcount) {
+		nmap = 1;
+		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
+				&got, &nmap, 0);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -EFSCORRUPTED;
+		if (!xfs_bmap_is_written_extent(&got))
+			continue;
+
+		for (dabno = round_up(got.br_startoff, geo->fsbcount);
+		     dabno < got.br_startoff + got.br_blockcount;
+		     dabno += geo->fsbcount) {
+			if (xchk_should_terminate(rd->sc, &error))
+				return error;
+
+			error = xrep_dir_recover_dirblock(rd,
+					magic_guess, dabno);
+			if (error)
+				return error;
+
+			/* Flush dirents to constrain memory usage. */
+			if (xrep_dir_want_flush_stashed(rd)) {
+				error = xrep_dir_flush_stashed(rd);
+				if (error)
+					return error;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Find all the directory entries for this inode by scraping them out of the
+ * directory leaf blocks by hand, and flushing them into the temp dir.
+ */
+STATIC int
+xrep_dir_find_entries(
+	struct xrep_dir		*rd)
+{
+	struct xfs_inode	*dp = rd->sc->ip;
+	int			error;
+
+	/*
+	 * Salvage directory entries from the old directory, and write them to
+	 * the temporary directory.
+	 */
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xrep_dir_recover_sf(rd);
+	} else {
+		error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
+		if (error)
+			return error;
+
+		error = xrep_dir_recover(rd);
+	}
+	if (error)
+		return error;
+
+	return xrep_dir_flush_stashed(rd);
+}
+
+/* Scan all files in the filesystem for dirents. */
+STATIC int
+xrep_dir_salvage_entries(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	int			error;
+
+	/*
+	 * Drop the ILOCK on this directory so that we can scan for this
+	 * directory's parent.  Figure out who is going to be the parent of
+	 * this directory, then retake the ILOCK so that we can salvage
+	 * directory entries.
+	 */
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+	error = xrep_dir_find_parent(rd);
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	if (error)
+		return error;
+
+	/*
+	 * Collect directory entries by parsing raw leaf blocks to salvage
+	 * whatever we can.  When we're done, free the staging memory before
+	 * exchanging the directories to reduce memory usage.
+	 */
+	error = xrep_dir_find_entries(rd);
+	if (error)
+		return error;
+
+	/*
+	 * Cancel the repair transaction and drop the ILOCK so that we can
+	 * (later) use the atomic mapping exchange functions to compute the
+	 * correct block reservations and re-lock the inodes.
+	 *
+	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent
+	 * directory modifications, but there's nothing to prevent userspace
+	 * from reading the directory until we're ready for the exchange
+	 * operation.  Reads will return -EIO without shutting down the fs,
+	 * so we're ok with that.
+	 *
+	 * The VFS can change dotdot on us, but the findparent scan will keep
+	 * our incore parent inode up to date.  See the note on locking issues
+	 * for more details.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Examine a parent pointer of a file.  If it leads us back to the directory
+ * that we're rebuilding, create an incore dirent from the parent pointer and
+ * stash it.
+ */
+STATIC int
+xrep_dir_scan_pptr(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	struct xfs_name		xname = {
+		.name		= name,
+		.len		= namelen,
+		.type		= xfs_mode_to_ftype(VFS_I(ip)->i_mode),
+	};
+	xfs_ino_t		parent_ino;
+	uint32_t		parent_gen;
+	struct xrep_dir		*rd = priv;
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	/*
+	 * Ignore parent pointers that point back to a different dir, carry
+	 * the wrong generation number, or are invalid.
+	 */
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, &parent_ino, &parent_gen);
+	if (error)
+		return error;
+
+	if (parent_ino != sc->ip->i_ino ||
+	    parent_gen != VFS_I(sc->ip)->i_generation)
+		return 0;
+
+	mutex_lock(&rd->pscan.lock);
+	error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
+	mutex_unlock(&rd->pscan.lock);
+	return error;
+}
+
+/*
+ * If this child dirent points to the directory being repaired, remember that
+ * fact so that we can reset the dotdot entry if necessary.
+ */
+STATIC int
+xrep_dir_scan_dirent(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xfs_dir2_dataptr_t	dapos,
+	const struct xfs_name	*name,
+	xfs_ino_t		ino,
+	void			*priv)
+{
+	struct xrep_dir		*rd = priv;
+
+	/* Dirent doesn't point to this directory. */
+	if (ino != rd->sc->ip->i_ino)
+		return 0;
+
+	/* Ignore garbage inum. */
+	if (!xfs_verify_dir_ino(rd->sc->mp, ino))
+		return 0;
+
+	/* No weird looking names. */
+	if (name->len >= MAXNAMELEN || name->len <= 0)
+		return 0;
+
+	/* Don't pick up dot or dotdot entries; we only want child dirents. */
+	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+	    xfs_dir2_samename(name, &xfs_name_dot))
+		return 0;
+
+	trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
+			dp->i_ino);
+
+	xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
+	return 0;
+}
+
+/*
+ * Decide if we want to look for child dirents or parent pointers in this
+ * file.  Skip the dir being repaired and any files being used to stage
+ * repairs.
+ */
+static inline bool
+xrep_dir_want_scan(
+	struct xrep_dir		*rd,
+	const struct xfs_inode	*ip)
+{
+	return ip != rd->sc->ip && !xrep_is_tempfile(ip);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
+ * has an unloaded attr bmbt.  Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_dir_scan_ilock(
+	struct xrep_dir		*rd,
+	struct xfs_inode	*ip)
+{
+	uint			lock_mode = XFS_ILOCK_SHARED;
+
+	/* Need to take the shared ILOCK to advance the iscan cursor. */
+	if (!xrep_dir_want_scan(rd, ip))
+		goto lock;
+
+	if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
+		lock_mode = XFS_ILOCK_EXCL;
+		goto lock;
+	}
+
+	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+		lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+	xfs_ilock(ip, lock_mode);
+	return lock_mode;
+}
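
The parent pointer callback above inverts each matching pointer into a replacement dirent. The essence of the match test, sketched with an invented record type (the kernel unpacks these fields with xfs_parent_from_attr(); struct sketch_pptr is illustrative only):

#include <stdbool.h>
#include <stdint.h>

/* A parent pointer lives in the child and names (parent ino, generation). */
struct sketch_pptr {
	uint64_t	parent_ino;
	uint32_t	parent_gen;
};

/*
 * A matching parent pointer inverts to a replacement dirent in the dir
 * being rebuilt: (name -> child inumber).  Both the inumber and the
 * generation must match so that stale pointers to a reused inumber are
 * rejected.
 */
static bool pptr_names_this_dir(const struct sketch_pptr *p,
				uint64_t dir_ino, uint32_t dir_gen)
{
	return p->parent_ino == dir_ino && p->parent_gen == dir_gen;
}
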
+
+/*
+ * Scan this file for relevant child dirents or parent pointers that point to
+ * the directory we're rebuilding.
+ */
+STATIC int
+xrep_dir_scan_file(
+	struct xrep_dir		*rd,
+	struct xfs_inode	*ip)
+{
+	unsigned int		lock_mode;
+	int			error = 0;
+
+	lock_mode = xrep_dir_scan_ilock(rd, ip);
+
+	if (!xrep_dir_want_scan(rd, ip))
+		goto scan_done;
+
+	/*
+	 * If the extended attributes look as though they have been zapped by
+	 * the inode record repair code, we cannot scan for parent pointers.
+	 */
+	if (xchk_pptr_looks_zapped(ip)) {
+		error = -EBUSY;
+		goto scan_done;
+	}
+
+	error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
+	if (error)
+		goto scan_done;
+
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		/*
+		 * If the directory looks as though it has been zapped by the
+		 * inode record repair code, we cannot scan for child dirents.
+		 */
+		if (xchk_dir_looks_zapped(ip)) {
+			error = -EBUSY;
+			goto scan_done;
+		}
+
+		error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
+		if (error)
+			goto scan_done;
+	}
+
+scan_done:
+	xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
+	xfs_iunlock(ip, lock_mode);
+	return error;
+}
+
+/*
+ * Scan all files in the filesystem for parent pointers that we can turn into
+ * replacement dirents, and a dirent that we can use to set the dotdot
+ * pointer.
+ */
+STATIC int
+xrep_dir_scan_dirtree(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/* Roots of directory trees are their own parents. */
+	if (xchk_inode_is_dirtree_root(sc->ip))
+		xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
+
+	/*
+	 * Filesystem scans are time consuming.  Drop the directory ILOCK and
+	 * all other resources for the duration of the scan and hope for the
+	 * best.  The live update hooks will keep our scan information up to
+	 * date even though we've dropped the locks.
+	 */
+	xchk_trans_cancel(sc);
+	if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
+		xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
+				XFS_ILOCK_EXCL));
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
+		bool	flush;
+
+		error = xrep_dir_scan_file(rd, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		/* Flush stashed dirent updates to constrain memory usage. */
+		mutex_lock(&rd->pscan.lock);
+		flush = xrep_dir_want_flush_stashed(rd);
+		mutex_unlock(&rd->pscan.lock);
+		if (flush) {
+			xchk_trans_cancel(sc);
+
+			error = xrep_tempfile_iolock_polled(sc);
+			if (error)
+				break;
+
+			error = xrep_dir_replay_updates(rd);
+			xrep_tempfile_iounlock(sc);
+			if (error)
+				break;
+
+			error = xchk_trans_alloc_empty(sc);
+			if (error)
+				break;
+		}
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&rd->pscan.iscan);
+	if (error) {
+		/*
+		 * If we couldn't grab an inode that was busy with a state
+		 * change, change the error code so that we exit to userspace
+		 * as quickly as possible.
+		 */
+		if (error == -EBUSY)
+			return -ECANCELED;
+		return error;
+	}
+
+	/*
+	 * Cancel the empty transaction so that we can (later) use the atomic
+	 * file mapping exchange functions to lock files and commit the new
+	 * directory.
+	 */
+	xchk_trans_cancel(rd->sc);
+	return 0;
+}
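
The live update hook that follows has two cases to tell apart. A toy dispatcher showing the split (all names here are hypothetical, and the real hook additionally consults the iscan cursor so that only already-visited files are stashed):

#include <stdint.h>

struct sketch_update {
	uint64_t	parent_ino;	/* directory being modified */
	uint64_t	child_ino;	/* file the dirent points at */
	int		delta;		/* positive add, negative remove */
};

/*
 * Dispatch one live update the way the hook below does: updates inside the
 * directory being repaired are stashed for replay; updates elsewhere that
 * point at it adjust the candidate '..' target (0 meaning "unknown").
 */
static void dispatch_update(const struct sketch_update *p,
			    uint64_t repair_ino,
			    void (*stash)(const struct sketch_update *),
			    void (*set_parent)(uint64_t))
{
	if (p->parent_ino == repair_ino)
		stash(p);		/* replay into the tempdir later */
	else if (p->child_ino == repair_ino)
		set_parent(p->delta > 0 ? p->parent_ino : 0);
}
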
+
+/*
+ * Capture dirent updates being made by other threads which are relevant to
+ * the directory being repaired.
+ */
+STATIC int
+xrep_dir_live_update(
+	struct notifier_block	*nb,
+	unsigned long		action,
+	void			*data)
+{
+	struct xfs_dir_update_params *p = data;
+	struct xrep_dir		*rd;
+	struct xfs_scrub	*sc;
+	int			error = 0;
+
+	rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
+	sc = rd->sc;
+
+	/*
+	 * This thread updated a child dirent in the directory that we're
+	 * rebuilding.  Stash the update for replay against the temporary
+	 * directory.
+	 */
+	if (p->dp->i_ino == sc->ip->i_ino &&
+	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
+		mutex_lock(&rd->pscan.lock);
+		if (p->delta > 0)
+			error = xrep_dir_stash_createname(rd, p->name,
+					p->ip->i_ino);
+		else
+			error = xrep_dir_stash_removename(rd, p->name,
+					p->ip->i_ino);
+		mutex_unlock(&rd->pscan.lock);
+		if (error)
+			goto out_abort;
+	}
+
+	/*
+	 * This thread updated another directory's child dirent that points to
+	 * the directory that we're rebuilding, so remember the new dotdot
+	 * target.
+	 */
+	if (p->ip->i_ino == sc->ip->i_ino &&
+	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
+		if (p->delta > 0) {
+			trace_xrep_dir_stash_createname(sc->tempip,
+					&xfs_name_dotdot,
+					p->dp->i_ino);
+
+			xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
+		} else {
+			trace_xrep_dir_stash_removename(sc->tempip,
+					&xfs_name_dotdot,
+					rd->pscan.parent_ino);
+
+			xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
+		}
+	}
+
+	return NOTIFY_DONE;
+out_abort:
+	xchk_iscan_abort(&rd->pscan.iscan);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Free all the directory blocks and reset the data fork.  The caller must
+ * join the inode to the transaction.  This function returns with the inode
+ * joined to a clean scrub transaction.
+ */
+STATIC int
+xrep_dir_reset_fork(
+	struct xrep_dir		*rd,
+	xfs_ino_t		parent_ino)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
+	int			error;
+
+	/* Unmap all the directory buffers. */
+	if (xfs_ifork_has_extents(ifp)) {
+		error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
+
+	trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
+
+	/* Reset the data fork to an empty data fork. */
+	xfs_idestroy_fork(ifp);
+	ifp->if_bytes = 0;
+	sc->tempip->i_disk_size = 0;
+
+	/* Reinitialize the short form directory. */
+	xrep_dir_init_args(rd, sc->tempip, NULL);
+	return xfs_dir2_sf_create(&rd->args, parent_ino);
+}
+
+/*
+ * Prepare both inodes' directory forks for exchanging mappings.  Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_dir_swap_prep(
+	struct xfs_scrub	*sc,
+	bool			temp_local,
+	bool			ip_local)
+{
+	int			error;
+
+	/*
+	 * If the tempfile's directory is in shortform format, convert that to
+	 * a single leaf extent so that we can use the atomic mapping
+	 * exchange.
+	 */
+	if (temp_local) {
+		struct xfs_da_args	args = {
+			.dp		= sc->tempip,
+			.geo		= sc->mp->m_dir_geo,
+			.whichfork	= XFS_DATA_FORK,
+			.trans		= sc->tp,
+			.total		= 1,
+			.owner		= sc->ip->i_ino,
+		};
+
+		error = xfs_dir2_sf_to_block(&args);
+		if (error)
+			return error;
+
+		/*
+		 * Roll the deferred log items to get us back to a clean
+		 * transaction.
+		 */
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform data fork, convert that
+	 * to an empty extent list in preparation for the atomic mapping
+	 * exchange.
+	 */
+	if (ip_local) {
+		struct xfs_ifork	*ifp;
+
+		ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+		xfs_idestroy_fork(ifp);
+		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+		ifp->if_nextents = 0;
+		ifp->if_bytes = 0;
+		ifp->if_data = NULL;
+		ifp->if_height = 0;
+
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	}
+
+	return 0;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */ +static int +xrep_dir_replace( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + return xfs_dir_replace_args(&rd->args); +} + +/* + * Reset the link count of this directory and adjust the unlinked list pointers + * as needed. + */ +STATIC int +xrep_dir_set_nlink( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = sc->ip; + struct xfs_perag *pag; + unsigned int new_nlink = min_t(unsigned long long, + rd->subdirs + 2, + XFS_NLINK_PINNED); + int error; + + /* + * The directory is not on the incore unlinked list, which means that + * it needs to be reachable via the directory tree. Update the nlink + * with our observed link count. If the directory has no parent, it + * will be moved to the orphanage. + */ + if (!xfs_inode_on_unlinked_list(dp)) + goto reset_nlink; + + /* + * The directory is on the unlinked list and we did not find any + * dirents. Set the link count to zero and let the directory + * inactivate when the last reference drops. + */ + if (rd->dirents == 0) { + rd->needs_adoption = false; + new_nlink = 0; + goto reset_nlink; + } + + /* + * The directory is on the unlinked list and we found dirents. This + * directory needs to be reachable via the directory tree. Remove the + * dir from the unlinked list and update nlink with the observed link + * count. If the directory has no parent, it will be moved to the + * orphanage. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, dp); + xfs_perag_put(pag); + if (error) + return error; + +reset_nlink: + if (VFS_I(dp)->i_nlink != new_nlink) + set_nlink(VFS_I(dp), new_nlink); + return 0; +} + +/* + * Finish replaying stashed dirent updates, allocate a transaction for + * exchanging data fork mappings, and take the ILOCKs of both directories + * before we commit the new directory structure. + */ +STATIC int +xrep_dir_finalize_tempdir( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible dirent updates. + * Replay all queued dirent updates into the tempdir before exchanging + * the contents, even if that means dropping the ILOCKs and the + * transaction. + */ + do { + error = xrep_dir_replay_updates(rd); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + if (error) + return error; + + if (xfarray_length(rd->dir_entries) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* Exchange the temporary directory's data fork with the one being repaired. */ +STATIC int +xrep_dir_swap( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; + bool ip_local, temp_local; + int error = 0; + + /* + * If we never found the parent for this directory, temporarily assign + * the root dir as the parent; we'll move this to the orphanage after + * exchanging the dir contents. 
+
+/* Exchange the temporary directory's data fork with the one being repaired. */
+STATIC int
+xrep_dir_swap(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		ino;
+	bool			ip_local, temp_local;
+	int			error = 0;
+
+	/*
+	 * If we never found the parent for this directory, temporarily assign
+	 * the root dir as the parent; we'll move this to the orphanage after
+	 * exchanging the dir contents.  We hold the ILOCK of the dir being
+	 * repaired, so we're not worried about racy updates of dotdot.
+	 */
+	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+	if (rd->pscan.parent_ino == NULLFSINO) {
+		rd->needs_adoption = true;
+		rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
+	}
+
+	/*
+	 * Reset the temporary directory's '..' entry to point to the parent
+	 * that we found.  The dirent replace code asserts if the dirent
+	 * already points at the new inumber, so we look it up here.
+	 *
+	 * It's also possible that this replacement could expand a sf tempdir
+	 * into block format.
+	 */
+	error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
+	if (error)
+		return error;
+
+	if (rd->pscan.parent_ino != ino) {
+		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
+				rd->pscan.parent_ino, rd->tx.req.resblks);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Changing the dot and dotdot entries could have changed the shape of
+	 * the directory, so we recompute these.
+	 */
+	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format data fork and the rebuilt
+	 * directory data would fit in the repaired file's data fork, copy
+	 * the contents from the tempfile and update the directory link count.
+	 * We're done now.
+	 */
+	if (ip_local && temp_local &&
+	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+		return xrep_dir_set_nlink(rd);
+	}
+
+	/*
+	 * Clean the transaction before we start working on exchanging
+	 * directory contents.
+	 */
+	error = xrep_tempfile_roll_trans(rd->sc);
+	if (error)
+		return error;
+
+	/* Otherwise, make sure both data forks are in block-mapping mode. */
+	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	/*
+	 * Set nlink of the directory in the same transaction sequence that
+	 * (atomically) commits the new directory data.
+	 */
+	error = xrep_dir_set_nlink(rd);
+	if (error)
+		return error;
+
+	return xrep_tempexch_contents(sc, &rd->tx);
+}
+
+/*
+ * Exchange the new directory contents (which we created in the tempfile)
+ * with the directory being repaired.
+ */
+STATIC int
+xrep_dir_rebuild_tree(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	int			error;
+
+	trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
+
+	/*
+	 * Take the IOLOCK on the temporary file so that we can run dir
+	 * operations with the same locks held as we would for a normal file.
+	 * We still hold sc->ip's IOLOCK.
+	 */
+	error = xrep_tempfile_iolock_polled(rd->sc);
+	if (error)
+		return error;
+
+	/*
+	 * Allocate transaction, lock inodes, and make sure that we've
+	 * replayed all the stashed dirent updates to the tempdir.  After this
+	 * point, we're ready to exchange data fork mappings.
+	 */
+	error = xrep_dir_finalize_tempdir(rd);
+	if (error)
+		return error;
+
+	if (xchk_iscan_aborted(&rd->pscan.iscan))
+		return -ECANCELED;
+
+	/*
+	 * Exchange the tempdir's data fork with the file being repaired.
+	 * This recreates the transaction and re-takes the ILOCK in the scrub
+	 * context.
+	 */
+	error = xrep_dir_swap(rd);
+	if (error)
+		return error;
+
+	/*
+	 * Release the old directory blocks and reset the data fork of the
+	 * temp directory to an empty shortform directory because inactivation
+	 * does nothing for directories.
+	 */
+	error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
+	if (error)
+		return error;
+
+	/*
+	 * Roll to get a transaction without any inodes joined to it.  Then we
+	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+	 * the scrub target directory.
+	 */
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		return error;
+
+	xrep_tempfile_iunlock(sc);
+	xrep_tempfile_iounlock(sc);
+	return 0;
+}
+
+/* Set up the filesystem scan so we can regenerate directory entries. */
+STATIC int
+xrep_dir_setup_scan(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	char			*descr;
+	int			error;
+
+	/* Set up some staging memory for salvaging dirents. */
+	descr = xchk_xfile_ino_descr(sc, "directory entries");
+	error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
+			&rd->dir_entries);
+	kfree(descr);
+	if (error)
+		return error;
+
+	descr = xchk_xfile_ino_descr(sc, "directory entry names");
+	error = xfblob_create(descr, &rd->dir_names);
+	kfree(descr);
+	if (error)
+		goto out_xfarray;
+
+	if (xfs_has_parent(sc->mp))
+		error = __xrep_findparent_scan_start(sc, &rd->pscan,
+				xrep_dir_live_update);
+	else
+		error = xrep_findparent_scan_start(sc, &rd->pscan);
+	if (error)
+		goto out_xfblob;
+
+	return 0;
+
+out_xfblob:
+	xfblob_destroy(rd->dir_names);
+	rd->dir_names = NULL;
+out_xfarray:
+	xfarray_destroy(rd->dir_entries);
+	rd->dir_entries = NULL;
+	return error;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and
+ * the orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_dir_move_to_orphanage(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		orig_parent, new_parent;
+	int			error;
+
+	/*
+	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
+	 * prepare for the adoption.  Therefore, look up the old dotdot entry
+	 * for sc->ip so that we can compare it after we re-lock sc->ip.
+	 */
+	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
+	if (error)
+		return error;
+
+	/*
+	 * Drop the ILOCK on the scrub target and commit the transaction.
+	 * Adoption computes its own resource requirements and gathers the
+	 * necessary components.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	/* If we can take the orphanage's iolock then we're ready to move. */
+	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+		xchk_iunlock(sc, sc->ilock_flags);
+		error = xrep_orphanage_iolock_two(sc);
+		if (error)
+			return error;
+	}
+
+	/* Grab transaction and ILOCK the two files. */
+	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
+	if (error)
+		return error;
+
+	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
+	if (error)
+		return error;
+
+	/*
+	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+	 * entry again.  If the parent changed or the child was unlinked while
+	 * the child directory was unlocked, we don't need to move the child
+	 * to the orphanage after all.
+	 */
+	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
+	if (error)
+		return error;
+
+	/*
+	 * Attach to the orphanage if we still have a linked directory and it
+	 * hasn't been moved.
+	 */
+	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
+		error = xrep_adoption_move(&rd->adoption);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Launder the scrub transaction so we can drop the orphanage ILOCK
+	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
+	 */
+	error = xrep_adoption_trans_roll(&rd->adoption);
+	if (error)
+		return error;
+
+	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
+	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Repair the directory metadata.
+ *
+ * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
+ * cache in XFS can't handle aliased multiblock buffers, so this might
+ * misbehave if the directory blocks are crosslinked with other filesystem
+ * metadata.
+ *
+ * XXX: Is it necessary to check the dcache for this directory to make sure
+ * that we always recreate every cached entry?
+ */
+int
+xrep_directory(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_dir		*rd = sc->buf;
+	int			error;
+
+	/* The rmapbt is required to reap the old data fork. */
+	if (!xfs_has_rmapbt(sc->mp))
+		return -EOPNOTSUPP;
+	/* We require atomic file exchange range to rebuild anything. */
+	if (!xfs_has_exchange_range(sc->mp))
+		return -EOPNOTSUPP;
+
+	error = xrep_dir_setup_scan(rd);
+	if (error)
+		return error;
+
+	if (xfs_has_parent(sc->mp))
+		error = xrep_dir_scan_dirtree(rd);
+	else
+		error = xrep_dir_salvage_entries(rd);
+	if (error)
+		goto out_teardown;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		goto out_teardown;
+
+	error = xrep_dir_rebuild_tree(rd);
+	if (error)
+		goto out_teardown;
+
+	if (rd->needs_adoption) {
+		if (!xrep_orphanage_can_adopt(rd->sc))
+			error = -EFSCORRUPTED;
+		else
+			error = xrep_dir_move_to_orphanage(rd);
+		if (error)
+			goto out_teardown;
+	}
+
+out_teardown:
+	xrep_dir_teardown(sc);
+	return error;
+}
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
new file mode 100644
index 000000000000..3a9cdf8738b6
--- /dev/null
+++ b/fs/xfs/scrub/dirtree.c
@@ -0,0 +1,1009 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/bitmap.h"
+#include "scrub/ino_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/listxattr.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/orphanage.h"
+#include "scrub/dirtree.h"
+
+/*
+ * Directory Tree Structure Validation
+ * ===================================
+ *
+ * Validating the tree qualities of the directory tree structure can be
+ * difficult.  If the tree is frozen, running a depth (or breadth) first
+ * search and marking a bitmap suffices to determine if there is a cycle.
+ * XORing the mark bitmap with the inode bitmap afterwards tells us if there
+ * are disconnected cycles.  If the tree is not frozen, directory updates can
+ * move subtrees across the scanner wavefront, which complicates the design
+ * greatly.
+ *
+ * Directory parent pointers change that by enabling an incremental approach
+ * to validation of the tree structure.  Instead of using one thread to scan
+ * the entire filesystem, we can have multiple threads walking individual
+ * subdirectories upwards to the root.  In a perfect world, the IOLOCK would
+ * suffice to stabilize two directories in a parent -> child relationship.
+ * Unfortunately, the VFS does not take the IOLOCK when moving a child
+ * subdirectory, so we instead synchronize on ILOCK and use dirent update
+ * hooks to detect a race.  If a race occurs in a path, we restart the scan.
+ *
+ * If the walk terminates without reaching the root, we know the path is
+ * disconnected and ought to be attached to the lost and found.  If on the
+ * walk we find the same subdir that we're scanning, we know this is a cycle
+ * and should delete an incoming edge.  If we find multiple paths to the
+ * root, we know to delete an incoming edge.
+ *
+ * There are two big hitches with this approach: first, all file link counts
+ * must be correct to prevent other writers from doing the wrong thing with
+ * the directory tree structure.  Second, because we're walking upwards in a
+ * tree of arbitrary depth, we cannot hold all the ILOCKs.  Instead, we will
+ * use a directory update hook to invalidate the scan results if one of the
+ * paths we've scanned has changed.
+ */
+
+/* Clean up the dirtree checking resources. */
+STATIC void
+xchk_dirtree_buf_cleanup(
+	void			*buf)
+{
+	struct xchk_dirtree	*dl = buf;
+	struct xchk_dirpath	*path, *n;
+
+	if (dl->scan_ino != NULLFSINO)
+		xfs_dir_hook_del(dl->sc->mp, &dl->dhook);
+
+	xchk_dirtree_for_each_path_safe(dl, path, n) {
+		list_del_init(&path->list);
+		xino_bitmap_destroy(&path->seen_inodes);
+		kfree(path);
+	}
+
+	xfblob_destroy(dl->path_names);
+	xfarray_destroy(dl->path_steps);
+	mutex_destroy(&dl->lock);
+}
+
+/* Set us up to look for directory loops. */
+int
+xchk_setup_dirtree(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_dirtree	*dl;
+	char			*descr;
+	int			error;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	if (xchk_could_repair(sc)) {
+		error = xrep_setup_dirtree(sc);
+		if (error)
+			return error;
+	}
+
+	dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS);
+	if (!dl)
+		return -ENOMEM;
+	dl->sc = sc;
+	dl->xname.name = dl->namebuf;
+	dl->hook_xname.name = dl->hook_namebuf;
+	INIT_LIST_HEAD(&dl->path_list);
+	dl->root_ino = NULLFSINO;
+	dl->scan_ino = NULLFSINO;
+	dl->parent_ino = NULLFSINO;
+
+	mutex_init(&dl->lock);
+
+	descr = xchk_xfile_ino_descr(sc, "dirtree path steps");
+	error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step),
+			&dl->path_steps);
+	kfree(descr);
+	if (error)
+		goto out_dl;
+
+	descr = xchk_xfile_ino_descr(sc, "dirtree path names");
+	error = xfblob_create(descr, &dl->path_names);
+	kfree(descr);
+	if (error)
+		goto out_steps;
+
+	error = xchk_setup_inode_contents(sc, 0);
+	if (error)
+		goto out_names;
+
+	sc->buf = dl;
+	sc->buf_cleanup = xchk_dirtree_buf_cleanup;
+	return 0;
+
+out_names:
+	xfblob_destroy(dl->path_names);
+out_steps:
+	xfarray_destroy(dl->path_steps);
+out_dl:
+	mutex_destroy(&dl->lock);
+	kvfree(dl);
+	return error;
+}
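
The frozen-tree algorithm described in the comment above is easy to state in miniature: walk parents upward, mark each directory in a bitmap, and declare a cycle when a directory repeats. A toy model over a small parent[] array (the kernel's xino_bitmap covers arbitrary inumbers; leads_to_root() is purely illustrative):

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_NR_DIRS	64	/* toy filesystem: one bit per directory */

/*
 * Walk parent[] upward from @start, marking each visited directory in a
 * bitmap.  Revisiting a marked directory means there is a cycle; reaching
 * @root means this path is fine.
 */
static bool leads_to_root(const unsigned int *parent, unsigned int start,
			  unsigned int root)
{
	uint64_t seen = 0;
	unsigned int cur = start;

	while (cur != root) {
		if (cur >= SKETCH_NR_DIRS || (seen & (1ULL << cur)))
			return false;	/* out of range, or a cycle */
		seen |= 1ULL << cur;
		cur = parent[cur];
	}
	return true;
}
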
+
+/*
+ * Add the parent pointer described by @pptr to the given path as a new step.
+ * Returns -ELNRNG if the path is too deep.
+ */
+int
+xchk_dirpath_append(
+	struct xchk_dirtree	*dl,
+	struct xfs_inode	*ip,
+	struct xchk_dirpath	*path,
+	const struct xfs_name	*name,
+	const struct xfs_parent_rec *pptr)
+{
+	struct xchk_dirpath_step step = {
+		.pptr_rec	= *pptr, /* struct copy */
+		.name_len	= name->len,
+	};
+	int			error;
+
+	/*
+	 * If this path is more than 2 billion steps long, this directory tree
+	 * is too far gone to fix.
+	 */
+	if (path->nr_steps >= XFS_MAXLINK)
+		return -ELNRNG;
+
+	error = xfblob_storename(dl->path_names, &step.name_cookie, name);
+	if (error)
+		return error;
+
+	error = xino_bitmap_set(&path->seen_inodes, ip->i_ino);
+	if (error)
+		return error;
+
+	error = xfarray_append(dl->path_steps, &step);
+	if (error)
+		return error;
+
+	path->nr_steps++;
+	return 0;
+}
+
+/*
+ * Create an xchk_path for each parent pointer of the directory that we're
+ * scanning.  For each path created, we will eventually try to walk towards
+ * the root with the goal of deleting all parents except for one that leads
+ * to the root.
+ *
+ * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt
+ * parent pointer and hence there's no point in continuing; or -ENOSR if
+ * there are too many parent pointers for this directory.
+ */
+STATIC int
+xchk_dirtree_create_path(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	struct xfs_name		xname = {
+		.name		= name,
+		.len		= namelen,
+	};
+	struct xchk_dirtree	*dl = priv;
+	struct xchk_dirpath	*path;
+	const struct xfs_parent_rec *rec = value;
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, NULL, NULL);
+	if (error)
+		return error;
+
+	/*
+	 * If there are more than 2 billion actual parent pointers for this
+	 * subdirectory, this fs is too far gone to fix.
+	 */
+	if (dl->nr_paths >= XFS_MAXLINK)
+		return -ENOSR;
+
+	trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec);
+
+	/*
+	 * Create a new xchk_path structure to remember this parent pointer
+	 * and record the first name step.
+	 */
+	path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS);
+	if (!path)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&path->list);
+	xino_bitmap_init(&path->seen_inodes);
+	path->nr_steps = 0;
+	path->outcome = XCHK_DIRPATH_SCANNING;
+
+	error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec);
+	if (error)
+		goto out_path;
+
+	path->first_step = xfarray_length(dl->path_steps) - 1;
+	path->second_step = XFARRAY_NULLIDX;
+	path->path_nr = dl->nr_paths;
+
+	list_add_tail(&path->list, &dl->path_list);
+	dl->nr_paths++;
+	return 0;
+out_path:
+	kfree(path);
+	return error;
+}
+
+/*
+ * Validate that the first step of this path still has a corresponding
+ * parent pointer in @sc->ip.  We probably dropped @sc->ip's ILOCK while
+ * walking towards the roots, which is why this is necessary.
+ *
+ * This function has a side effect of loading the first parent pointer of
+ * this path into the parent pointer scratch pad.  This prepares us to walk
+ * up the directory tree towards the root.  Returns -ESTALE if the scan data
+ * is now out of date.
+ */
+STATIC int
+xchk_dirpath_revalidate(
+	struct xchk_dirtree	*dl,
+	struct xchk_dirpath	*path)
+{
+	struct xfs_scrub	*sc = dl->sc;
+	int			error;
+
+	/*
+	 * Look up the parent pointer that corresponds to the start of this
+	 * path.  If the parent pointer has disappeared on us, dump all the
+	 * scan results and try again.
+	 */
+	error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec,
+			&dl->pptr_args);
+	if (error == -ENOATTR) {
+		trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr,
+				path->first_step, &dl->xname, &dl->pptr_rec);
+		dl->stale = true;
+		return -ESTALE;
+	}
+
+	return error;
+}
+
+/*
+ * Walk the parent pointers of a directory at the end of a path and record
+ * the parent that we find in @dl->xname/pptr_rec.
+ */
+STATIC int
+xchk_dirpath_find_next_step(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	struct xchk_dirtree	*dl = priv;
+	const struct xfs_parent_rec *rec = value;
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, NULL, NULL);
+	if (error)
+		return error;
+
+	/*
+	 * If we've already set @dl->pptr_rec, then this directory has
+	 * multiple parents.  Signal this back to the caller via -EMLINK.
+	 */
+	if (dl->parents_found > 0)
+		return -EMLINK;
+
+	dl->parents_found++;
+	memcpy(dl->namebuf, name, namelen);
+	dl->xname.len = namelen;
+	dl->pptr_rec = *rec; /* struct copy */
+	return 0;
+}
+
+/* Set and log the outcome of a path walk. */
+static inline void
+xchk_dirpath_set_outcome(
+	struct xchk_dirtree	*dl,
+	struct xchk_dirpath	*path,
+	enum xchk_dirpath_outcome outcome)
+{
+	trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps,
+			outcome);
+
+	path->outcome = outcome;
+}
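
xchk_dirpath_find_next_step() above enforces the exactly-one-parent invariant by counting hits and aborting the xattr walk on the second one. The same visitor pattern in miniature (struct parent_scan and record_parent() are invented for illustration):

#include <errno.h>
#include <stdint.h>

/* Record the sole parent of a directory; a second hit is a corruption. */
struct parent_scan {
	uint64_t	parent_ino;
	unsigned int	parents_found;
};

static int record_parent(struct parent_scan *ps, uint64_t parent_ino)
{
	if (ps->parents_found > 0)
		return -EMLINK;		/* more than one parent: stop */
	ps->parents_found++;
	ps->parent_ino = parent_ino;
	return 0;
}
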
+
+/*
+ * Scan the directory at the end of this path for its parent directory link.
+ * If we find one, extend the path.  Returns -ESTALE if the scan data is out
+ * of date.  Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG
+ * if the path got too deep.
+ */
+STATIC int
+xchk_dirpath_step_up(
+	struct xchk_dirtree	*dl,
+	struct xchk_dirpath	*path,
+	bool			is_metadir)
+{
+	struct xfs_scrub	*sc = dl->sc;
+	struct xfs_inode	*dp;
+	xfs_ino_t		parent_ino = be64_to_cpu(dl->pptr_rec.p_ino);
+	unsigned int		lock_mode;
+	int			error;
+
+	/* Grab and lock the parent directory. */
+	error = xchk_iget(sc, parent_ino, &dp);
+	if (error)
+		return error;
+
+	lock_mode = xfs_ilock_attr_map_shared(dp);
+	mutex_lock(&dl->lock);
+
+	if (dl->stale) {
+		error = -ESTALE;
+		goto out_scanlock;
+	}
+
+	/* We've reached the root directory; the path is ok. */
+	if (parent_ino == dl->root_ino) {
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK);
+		error = 0;
+		goto out_scanlock;
+	}
+
+	/*
+	 * The inode being scanned is its own distant ancestor!  Get rid of
+	 * this path.
+	 */
+	if (parent_ino == sc->ip->i_ino) {
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+		error = 0;
+		goto out_scanlock;
+	}
+
+	/*
+	 * We've seen this inode before during the path walk.  There's a loop
+	 * above us in the directory tree.  This probably means that we cannot
+	 * continue, but let's keep walking paths to get a full picture.
+	 */
+	if (xino_bitmap_test(&path->seen_inodes, parent_ino)) {
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP);
+		error = 0;
+		goto out_scanlock;
+	}
+
+	/* The handle encoded in the parent pointer must match. */
+	if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) {
+		trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr,
+				path->nr_steps, &dl->xname, &dl->pptr_rec);
+		error = -EFSCORRUPTED;
+		goto out_scanlock;
+	}
+
+	/* Parent pointer must point up to a directory. */
+	if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+		trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr,
+				path->nr_steps, &dl->xname, &dl->pptr_rec);
+		error = -EFSCORRUPTED;
+		goto out_scanlock;
+	}
+
+	/* Parent cannot be an unlinked directory. */
+	if (VFS_I(dp)->i_nlink == 0) {
+		trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr,
+				path->nr_steps, &dl->xname, &dl->pptr_rec);
+		error = -EFSCORRUPTED;
+		goto out_scanlock;
+	}
+
+	/* Parent must be in the same directory tree. */
+	if (is_metadir != xfs_is_metadir_inode(dp)) {
+		trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr,
+				path->nr_steps, &dl->xname, &dl->pptr_rec);
+		error = -EFSCORRUPTED;
+		goto out_scanlock;
+	}
+
+	/*
+	 * If the extended attributes look as though they have been zapped by
+	 * the inode record repair code, we cannot scan for parent pointers.
+	 */
+	if (xchk_pptr_looks_zapped(dp)) {
+		error = -EBUSY;
+		xchk_set_incomplete(sc);
+		goto out_scanlock;
+	}
+
+	/*
+	 * Walk the parent pointers of @dp to find the parent of this
+	 * directory, which gives us the next step in our walk.  If we find
+	 * that @dp has exactly one parent, the parent pointer information
+	 * will be stored in @dl->pptr_rec.  This prepares us for the next
+	 * step of the walk.
+	 */
+	mutex_unlock(&dl->lock);
+	dl->parents_found = 0;
+	error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl);
+	mutex_lock(&dl->lock);
+	if (error == -EFSCORRUPTED || error == -EMLINK ||
+	    (!error && dl->parents_found == 0)) {
+		/*
+		 * Further up the directory tree from @sc->ip, we found a
+		 * corrupt parent pointer, multiple parent pointers while
+		 * finding this directory's parent, or zero parents despite
+		 * having a nonzero link count.  Keep looking for other paths.
+		 */
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+		error = 0;
+		goto out_scanlock;
+	}
+	if (error)
+		goto out_scanlock;
+
+	if (dl->stale) {
+		error = -ESTALE;
+		goto out_scanlock;
+	}
+
+	trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr,
+			path->nr_steps, &dl->xname, &dl->pptr_rec);
+
+	/* Append to the path steps */
+	error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec);
+	if (error)
+		goto out_scanlock;
+
+	if (path->second_step == XFARRAY_NULLIDX)
+		path->second_step = xfarray_length(dl->path_steps) - 1;
+
+out_scanlock:
+	mutex_unlock(&dl->lock);
+	xfs_iunlock(dp, lock_mode);
+	xchk_irele(sc, dp);
+	return error;
+}
+
+/*
+ * Walk the directory tree upwards towards what is hopefully the root
+ * directory, recording path steps as we go.  The current path components
+ * are stored in dl->pptr_rec and dl->xname.
+ *
+ * Returns -ESTALE if the scan data are out of date.  Returns -EFSCORRUPTED
+ * only if the direct parent pointer of @sc->ip associated with this path is
+ * corrupt.
+ */
+STATIC int
+xchk_dirpath_walk_upwards(
+	struct xchk_dirtree	*dl,
+	struct xchk_dirpath	*path)
+{
+	struct xfs_scrub	*sc = dl->sc;
+	bool			is_metadir;
+	int			error;
+
+	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+	/* Reload the start of this path and make sure it's still there. */
+	error = xchk_dirpath_revalidate(dl, path);
+	if (error)
+		return error;
+
+	trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname,
+			&dl->pptr_rec);
+
+	/*
+	 * The inode being scanned is its own direct ancestor!
+	 * Get rid of this path.
+	 */
+	if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) {
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE);
+		return 0;
+	}
+
+	/*
+	 * Drop ILOCK_EXCL on the inode being scanned.  We still hold
+	 * IOLOCK_EXCL on it, so it cannot move around or be renamed.
+	 *
+	 * Beyond this point we're walking up the directory tree, which means
+	 * that we can acquire and drop the ILOCK on an alias of sc->ip.  The
+	 * ILOCK state is no longer tracked in the scrub context.  Hence we
+	 * must drop @sc->ip's ILOCK during the walk.
+	 */
+	is_metadir = xfs_is_metadir_inode(sc->ip);
+	mutex_unlock(&dl->lock);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	/*
+	 * Take the first step in the walk towards the root by checking the
+	 * start of this path, which is a direct parent pointer of @sc->ip.
+	 * If we see any kind of error here (including corruptions), the
+	 * parent pointer of @sc->ip is corrupt.  Stop the whole scan.
+	 */
+	error = xchk_dirpath_step_up(dl, path, is_metadir);
+	if (error) {
+		xchk_ilock(sc, XFS_ILOCK_EXCL);
+		mutex_lock(&dl->lock);
+		return error;
+	}
+
+	/*
+	 * Take steps upward from the second step in this path towards the
+	 * root.  If we hit corruption errors here, there's a problem
+	 * *somewhere* in the path, but we don't need to stop scanning.
+	 */
+	while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
+		error = xchk_dirpath_step_up(dl, path, is_metadir);
+
+	/* Retake the locks we had, mark paths, etc. */
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	mutex_lock(&dl->lock);
+	if (error == -EFSCORRUPTED) {
+		xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT);
+		error = 0;
+	}
+	if (!error && dl->stale)
+		return -ESTALE;
+	return error;
+}
+
+/*
+ * Decide if this path step has been touched by this live update.  Returns
+ * 1 for yes, 0 for no, or a negative errno.
+ */
+STATIC int
+xchk_dirpath_step_is_stale(
+	struct xchk_dirtree	*dl,
+	struct xchk_dirpath	*path,
+	unsigned int		step_nr,
+	xfarray_idx_t		step_idx,
+	struct xfs_dir_update_params *p,
+	xfs_ino_t		*cursor)
+{
+	struct xchk_dirpath_step step;
+	xfs_ino_t		child_ino = *cursor;
+	int			error;
+
+	error = xfarray_load(dl->path_steps, step_idx, &step);
+	if (error)
+		return error;
+	*cursor = be64_to_cpu(step.pptr_rec.p_ino);
+
+	/*
+	 * If the parent and child being updated are not the ones mentioned
+	 * in this path step, the scan data is still ok.
+	 */
+	if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor)
+		return 0;
+
+	/*
+	 * If the dirent name lengths or byte sequences are different, the
+	 * scan data is still ok.
+	 */
+	if (p->name->len != step.name_len)
+		return 0;
+
+	error = xfblob_loadname(dl->path_names, step.name_cookie,
+			&dl->hook_xname, step.name_len);
+	if (error)
+		return error;
+
+	if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0)
+		return 0;
+
+	/*
+	 * If the update comes from the repair code itself, walk the state
+	 * machine forward.
+	 */
+	if (p->ip->i_ino == dl->scan_ino &&
+	    path->outcome == XREP_DIRPATH_ADOPTING) {
+		xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED);
+		return 0;
+	}
+
+	if (p->ip->i_ino == dl->scan_ino &&
+	    path->outcome == XREP_DIRPATH_DELETING) {
+		xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED);
+		return 0;
+	}
+
+	/* Exact match, scan data is out of date. */
*/ + trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp, + p->ip, p->name); + return 1; +} + +/* + * Decide if this path has been touched by this live update. Returns 1 for + * yes, 0 for no, or a negative errno. + */ +STATIC int +xchk_dirpath_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xfs_dir_update_params *p) +{ + xfs_ino_t cursor = dl->scan_ino; + xfarray_idx_t idx = path->first_step; + unsigned int i; + int ret; + + /* + * The child being updated has not been seen by this path at all; this + * path cannot be stale. + */ + if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino)) + return 0; + + ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor); + if (ret != 0) + return ret; + + for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) { + ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * Decide if a directory update from the regular filesystem touches any of the + * paths we've scanned, and invalidate the scan data if true. + */ +STATIC int +xchk_dirtree_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_dirtree *dl; + struct xchk_dirpath *path; + int ret; + + dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb); + + trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta, + p->name); + + mutex_lock(&dl->lock); + + if (dl->stale || dl->aborted) + goto out_unlock; + + xchk_dirtree_for_each_path(dl, path) { + ret = xchk_dirpath_is_stale(dl, path, p); + if (ret < 0) { + dl->aborted = true; + break; + } + if (ret == 1) { + dl->stale = true; + break; + } + } + +out_unlock: + mutex_unlock(&dl->lock); + return NOTIFY_DONE; +} + +/* Delete all the collected path information. */ +STATIC void +xchk_dirtree_reset( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + dl->nr_paths = 0; + + xfarray_truncate(dl->path_steps); + xfblob_truncate(dl->path_names); + + dl->stale = false; +} + +/* + * Load the name/pptr from the first step in this path into @dl->pptr_rec and + * @dl->xname. + */ +STATIC int +xchk_dirtree_load_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname, + step.name_len); + if (error) + return error; + + dl->pptr_rec = step.pptr_rec; /* struct copy */ + return 0; +} + +/* + * For each parent pointer of this subdir, trace a path upwards towards the + * root directory and record what we find. Returns 0 for success; + * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed; -ELNRNG if a + * path was too deep; -ENOSR if there were too many parent pointers; or + * a negative errno. + */ +int +xchk_dirtree_find_paths_to_root( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error = 0; + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xchk_dirtree_reset(dl); + + /* + * If the extended attributes look as though they have been + * zapped by the inode record repair code, we cannot scan for + * parent pointers. 
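The live update hook above and the retry loop being set up in xchk_dirtree_find_paths_to_root form a simple invalidation handshake: writers set a stale flag under the mutex, and the scanner reruns its walk until one pass completes without being invalidated. A compact pthreads model of the pattern (all names hypothetical):

#include <pthread.h>
#include <stdbool.h>

struct scan_state {
	pthread_mutex_t lock;
	bool stale;		/* set by the update hook */
};

/* Hook side: a directory update touched data we already scanned. */
static void hook_mark_stale(struct scan_state *ss)
{
	pthread_mutex_lock(&ss->lock);
	ss->stale = true;
	pthread_mutex_unlock(&ss->lock);
}

/* Scanner side: rerun the walk until one pass stays clean. */
static int scan_until_stable(struct scan_state *ss,
			     int (*walk)(struct scan_state *ss))
{
	int error;
	bool stale;

	do {
		pthread_mutex_lock(&ss->lock);
		ss->stale = false;
		pthread_mutex_unlock(&ss->lock);

		error = walk(ss);

		pthread_mutex_lock(&ss->lock);
		stale = ss->stale;
		pthread_mutex_unlock(&ss->lock);
	} while (!error && stale);
	return error;
}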
+ */ + if (xchk_pptr_looks_zapped(sc->ip)) { + xchk_set_incomplete(sc); + return -EBUSY; + } + + /* + * Create path walk contexts for each parent of the directory + * that is being scanned. Directories are supposed to have + * only one parent, but this is how we detect multiple parents. + */ + error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path, + NULL, dl); + if (error) + return error; + + xchk_dirtree_for_each_path(dl, path) { + /* Load path components into dl->pptr/xname */ + error = xchk_dirtree_load_path(dl, path); + if (error) + return error; + + /* + * Try to walk up each path to the root. This enables + * us to find directory loops in ancestors, and the + * like. + */ + error = xchk_dirpath_walk_upwards(dl, path); + if (error == -EFSCORRUPTED) { + /* + * A parent pointer of @sc->ip is bad, don't + * bother continuing. + */ + break; + } + if (error == -ESTALE) { + /* This had better be an invalidation. */ + ASSERT(dl->stale); + break; + } + if (error) + return error; + if (dl->aborted) + return 0; + } + } while (dl->stale); + + return error; +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +void +xchk_dirtree_evaluate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + ASSERT(!dl->stale); + + /* Scan the paths we have to decide what to do. */ + memset(oc, 0, sizeof(struct xchk_dirtree_outcomes)); + xchk_dirtree_for_each_path(dl, path) { + trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr, + path->nr_steps, path->outcome); + + switch (path->outcome) { + case XCHK_DIRPATH_SCANNING: + /* shouldn't get here */ + ASSERT(0); + break; + case XCHK_DIRPATH_DELETE: + /* This one is already going away. */ + oc->bad++; + break; + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + /* Couldn't find the end of this path. */ + oc->suspect++; + break; + case XCHK_DIRPATH_STALE: + /* shouldn't get here either */ + ASSERT(0); + break; + case XCHK_DIRPATH_OK: + /* This path got all the way to the root. */ + oc->good++; + break; + case XREP_DIRPATH_DELETING: + case XREP_DIRPATH_DELETED: + case XREP_DIRPATH_ADOPTING: + case XREP_DIRPATH_ADOPTED: + /* These should not be in progress! */ + ASSERT(0); + break; + } + } + + trace_xchk_dirtree_evaluate(dl, oc); +} + +/* Look for directory loops. */ +int +xchk_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree_outcomes oc; + struct xchk_dirtree *dl = sc->buf; + int error; + + /* + * Nondirectories do not point downwards to other files, so they cannot + * cause a cycle in the directory tree. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + ASSERT(xfs_has_parent(sc->mp)); + + /* + * Find the root of the directory tree. Remember which directory to + * scan, because the hook doesn't detach until after sc->ip gets + * released during teardown. + */ + dl->root_ino = xchk_inode_rootdir_inum(sc->ip); + dl->scan_ino = sc->ip->i_ino; + + trace_xchk_dirtree_start(sc->ip, sc->sm, 0); + + /* + * Hook into the directory entry code so that we can capture updates to + * paths that we have already scanned. The scanner thread takes each + * directory's ILOCK, which means that any in-progress directory update + * will finish before we can scan the directory. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update); + error = xfs_dir_hook_add(sc->mp, &dl->dhook); + if (error) + goto out; + + mutex_lock(&dl->lock); + + /* Trace each parent pointer's path to the root. 
*/ + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) { + /* + * Don't bother walking the paths if the xattr structure or the + * parent pointers are corrupt; this scan cannot be completed + * without full information. + */ + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_scanlock; + } + if (error == -EBUSY) { + /* + * We couldn't scan some directory's parent pointers because + * the attr fork looked like it had been zapped. The + * scan was marked incomplete, so no further error code + * is necessary. + */ + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + if (dl->aborted) { + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* Assess what we found in our path evaluation. */ + xchk_dirtree_evaluate(dl, &oc); + if (xchk_dirtree_parentless(dl)) { + if (oc.good || oc.bad || oc.suspect) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (oc.bad || oc.good + oc.suspect != 1) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + if (oc.suspect) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + } + +out_scanlock: + mutex_unlock(&dl->lock); +out: + trace_xchk_dirtree_done(sc->ip, sc->sm, error); + return error; +} + +/* Does the directory targeted by this scrub have no parents? */ +bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (xchk_inode_is_dirtree_root(sc->ip)) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h new file mode 100644 index 000000000000..9e5d95492717 --- /dev/null +++ b/fs/xfs/scrub/dirtree.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DIRTREE_H__ +#define __XFS_SCRUB_DIRTREE_H__ + +/* + * Each of these represents one parent pointer path step in a chain going + * up towards the directory tree root. These are stored inside an xfarray. + */ +struct xchk_dirpath_step { + /* Directory entry name associated with this parent link. */ + xfblob_cookie name_cookie; + unsigned int name_len; + + /* Handle of the parent directory. */ + struct xfs_parent_rec pptr_rec; +}; + +enum xchk_dirpath_outcome { + XCHK_DIRPATH_SCANNING = 0, /* still being put together */ + XCHK_DIRPATH_DELETE, /* delete this path */ + XCHK_DIRPATH_CORRUPT, /* corruption detected in path */ + XCHK_DIRPATH_LOOP, /* cycle detected further up */ + XCHK_DIRPATH_STALE, /* path is stale */ + XCHK_DIRPATH_OK, /* path reaches the root */ + + XREP_DIRPATH_DELETING, /* path is being deleted */ + XREP_DIRPATH_DELETED, /* path has been deleted */ + XREP_DIRPATH_ADOPTING, /* path is being adopted */ + XREP_DIRPATH_ADOPTED, /* path has been adopted */ +}; + +/* + * Each of these represents one parent pointer path out of the directory being + * scanned. These exist in-core, and hopefully there aren't more than a + * handful of them. + */ +struct xchk_dirpath { + struct list_head list; + + /* Index of the first step in this path. */ + xfarray_idx_t first_step; + + /* Index of the second step in this path. */ + xfarray_idx_t second_step; + + /* Inodes seen while walking this path. */ + struct xino_bitmap seen_inodes; + + /* Number of steps in this path. */ + unsigned int nr_steps; + + /* Which path is this? */ + unsigned int path_nr; + + /* What did we conclude from following this path? 
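The verdict at the end of xchk_dirtree above reduces to a small predicate over the outcome tallies; restating it as standalone C makes the two regimes explicit:

#include <stdbool.h>

/*
 * Parentless directories (tree roots, unlinked dirs) must have no paths
 * at all; any other directory must have exactly one live path and no
 * paths that are already going away.
 */
static bool dirtree_is_corrupt(bool parentless, unsigned int good,
			       unsigned int bad, unsigned int suspect)
{
	if (parentless)
		return good || bad || suspect;
	return bad || good + suspect != 1;
}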
*/ + enum xchk_dirpath_outcome outcome; +}; + +struct xchk_dirtree_outcomes { + /* Number of XCHK_DIRPATH_DELETE */ + unsigned int bad; + + /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */ + unsigned int suspect; + + /* Number of XCHK_DIRPATH_OK */ + unsigned int good; + + /* Directory needs to be added to lost+found */ + bool needs_adoption; +}; + +struct xchk_dirtree { + struct xfs_scrub *sc; + + /* Root inode that we're looking for. */ + xfs_ino_t root_ino; + + /* + * This is the inode that we're scanning. The live update hook can + * continue to be called after xchk_teardown drops sc->ip but before + * it calls buf_cleanup, so we keep a copy. + */ + xfs_ino_t scan_ino; + + /* + * If we start deleting redundant paths to this subdirectory, this is + * the inode number of the surviving parent and the dotdot entry will + * be set to this value. If the value is NULLFSINO, then use @root_ino + * as a stand-in until the orphanage can adopt the subdirectory. + */ + xfs_ino_t parent_ino; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; + + /* Information for reparenting this directory. */ + struct xrep_adoption adoption; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; + + /* Parent pointer update arguments. */ + struct xfs_parent_args ppargs; + + /* lock for everything below here */ + struct mutex lock; + + /* buffer for the live update functions to use for dirent names */ + struct xfs_name hook_xname; + unsigned char hook_namebuf[MAXNAMELEN]; + + /* + * All path steps observed during this scan. Each of the path + * steps for a particular pathwalk are recorded in sequential + * order in the xfarray. A pathwalk ends either with a step + * pointing to the root directory (success) or pointing to NULLFSINO + * (loop detected, empty dir detected, etc). + */ + struct xfarray *path_steps; + + /* All names observed during this scan. */ + struct xfblob *path_names; + + /* All paths being tracked by this scanner. */ + struct list_head path_list; + + /* Number of paths in path_list. */ + unsigned int nr_paths; + + /* Number of parents found by a pptr scan. */ + unsigned int parents_found; + + /* Have the path data been invalidated by a concurrent update? */ + bool stale:1; + + /* Has the scan been aborted? */ + bool aborted:1; +}; + +#define xchk_dirtree_for_each_path_safe(dl, path, n) \ + list_for_each_entry_safe((path), (n), &(dl)->path_list, list) + +#define xchk_dirtree_for_each_path(dl, path) \ + list_for_each_entry((path), &(dl)->path_list, list) + +bool xchk_dirtree_parentless(const struct xchk_dirtree *dl); + +int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); +int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, + struct xchk_dirpath *path, const struct xfs_name *name, + const struct xfs_parent_rec *pptr); +void xchk_dirtree_evaluate(struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc); + +#endif /* __XFS_SCRUB_DIRTREE_H__ */ diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c new file mode 100644 index 000000000000..5c04e70ba951 --- /dev/null +++ b/fs/xfs/scrub/dirtree_repair.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_trans_space.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" +#include "scrub/readdir.h" + +/* + * Directory Tree Structure Repairs + * ================================ + * + * If we decide that the directory being scanned is participating in a + * directory loop, the only change we can make is to remove directory entries + * pointing down to @sc->ip. If that leaves it with no parents, the directory + * should be adopted by the orphanage. + */ + +/* Set up to repair directory loops. */ +int +xrep_setup_dirtree( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* Change the outcome of this path. */ +static inline void +xrep_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* Delete all paths. */ +STATIC void +xrep_dirtree_delete_all_paths( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good == 0); +} + +/* Since this is the surviving path, set the dotdot entry to this value. */ +STATIC void +xrep_dirpath_retain_parent( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return; + + dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino); +} + +/* Find the one surviving path so we know how to set dotdot. */ +STATIC void +xrep_dirtree_find_surviving_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + ASSERT(foundit == false); + break; + default: + break; + } + } + + ASSERT(oc->suspect + oc->good == 1); +} + +/* Delete all paths except for the one good one. 
*/ +STATIC void +xrep_dirtree_keep_one_good_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good < 2); +} + +/* Delete all paths except for one suspect one. */ +STATIC void +xrep_dirtree_keep_one_suspect_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + ASSERT(0); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 1); + ASSERT(oc->good == 0); +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +STATIC void +xrep_dirtree_decide_fate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + xchk_dirtree_evaluate(dl, oc); + + /* Parentless directories should not have any paths at all. */ + if (xchk_dirtree_parentless(dl)) { + xrep_dirtree_delete_all_paths(dl, oc); + return; + } + + /* One path is exactly the number of paths we want. */ + if (oc->good + oc->suspect == 1) { + xrep_dirtree_find_surviving_path(dl, oc); + return; + } + + /* Zero paths means we should reattach the subdir to the orphanage. */ + if (oc->good + oc->suspect == 0) { + if (dl->sc->orphanage) + oc->needs_adoption = true; + return; + } + + /* + * Otherwise, this subdirectory has too many parents. If there's at + * least one good path, keep it and delete the others. + */ + if (oc->good > 0) { + xrep_dirtree_keep_one_good_path(dl, oc); + return; + } + + /* + * There are no good paths and there are too many suspect paths. + * Keep the first suspect path and delete the rest. + */ + xrep_dirtree_keep_one_suspect_path(dl, oc); +} + +/* + * Load the first step of this path into @step and @dl->xname/pptr + * for later repair work. + */ +STATIC int +xrep_dirtree_prep_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + int error; + + error = xfarray_load(dl->path_steps, path->first_step, step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname, + step->name_len); + if (error) + return error; + + dl->pptr_rec = step->pptr_rec; /* struct copy */ + return 0; +} + +/* Delete the VFS dentry for a removed child. */ +STATIC int +xrep_dirtree_purge_dentry( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + struct qstr qname = QSTR_INIT(name->name, name->len); + struct dentry *parent_dentry, *child_dentry; + int error = 0; + + /* + * Find the dentry for the parent directory. If there isn't one, we're + * done. Caller already holds i_rwsem for parent and child. 
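xrep_dirtree_decide_fate above is a chain of early returns; the same case analysis, condensed into one pure function over the tallies (enum names hypothetical):

#include <stdbool.h>

enum fate {
	FATE_DELETE_ALL,	/* parentless: no path may survive */
	FATE_KEEP_THE_ONE,	/* exactly one path: keep it */
	FATE_ADOPT,		/* no paths: send to the orphanage */
	FATE_KEEP_ONE_GOOD,	/* keep one good path, delete the rest */
	FATE_KEEP_ONE_SUSPECT,	/* no good paths: keep one suspect */
};

static enum fate decide_fate(bool parentless, unsigned int good,
			     unsigned int suspect)
{
	if (parentless)
		return FATE_DELETE_ALL;
	if (good + suspect == 1)
		return FATE_KEEP_THE_ONE;
	if (good + suspect == 0)
		return FATE_ADOPT;
	return good > 0 ? FATE_KEEP_ONE_GOOD : FATE_KEEP_ONE_SUSPECT;
}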
+ */ + parent_dentry = d_find_alias(VFS_I(dp)); + if (!parent_dentry) + return 0; + + /* The VFS thinks the parent is a directory, right? */ + if (!d_is_dir(parent_dentry)) { + ASSERT(d_is_dir(parent_dentry)); + error = -EFSCORRUPTED; + goto out_dput_parent; + } + + /* + * Try to find the dirent pointing to the child. If there isn't one, + * we're done. + */ + qname.hash = full_name_hash(parent_dentry, name->name, name->len); + child_dentry = d_lookup(parent_dentry, &qname); + if (!child_dentry) { + error = 0; + goto out_dput_parent; + } + + trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry); + + /* Child is not a directory? We're screwed. */ + if (!d_is_dir(child_dentry)) { + ASSERT(d_is_dir(child_dentry)); + error = -EFSCORRUPTED; + goto out_dput_child; + } + + /* Replace the child dentry with a negative one. */ + d_delete(child_dentry); + +out_dput_child: + dput(child_dentry); +out_dput_parent: + dput(parent_dentry); + return error; +} + +/* + * Prepare to delete a link by taking the IOLOCK of the parent and the child + * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we + * took both locks, or a negative errno if we couldn't lock the parent in time. + */ +static inline int +xrep_dirtree_unlink_iolock( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xfs_ilock(dp, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Remove a link from the directory tree and update the dcache. Returns + * -ESTALE if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_unlink( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t dotdot_ino; + xfs_ino_t parent_ino = dl->parent_ino; + unsigned int resblks; + int dontcare; + int error; + + /* Take IOLOCK_EXCL of the parent and child. */ + error = xrep_dirtree_unlink_iolock(sc, dp); + if (error) + return error; + + /* + * Create the transaction that we need to sever the path. Ignore + * EDQUOT and ENOSPC being returned via nospace_error because the + * directory code can handle a reservationless update. + */ + resblks = xfs_remove_space_res(mp, step->name_len); + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip, + &resblks, &sc->tp, &dontcare); + if (error) + goto out_iolock; + + /* + * Cancel if someone invalidated the paths while we were trying to get + * the ILOCK. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans_cancel; + } + xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING); + mutex_unlock(&dl->lock); + + trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr, + &dl->xname, &dl->pptr_rec); + + /* + * Decide if we need to reset the dotdot entry. Rules: + * + * - If there's a surviving parent, we want dotdot to point there. + * - If we don't have any surviving parents, then point dotdot at the + * root dir. + * - If dotdot is already set to the value we want, pass in NULLFSINO + * for no change necessary. + * + * Do this /before/ we dirty anything, in case the dotdot lookup + * fails. 
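The trylock-and-retry loop in xrep_dirtree_unlink_iolock above is a standard deadlock-avoidance shape: keep the lock you already hold only if the second lock can be taken without blocking; otherwise release everything and reacquire in the opposite order. The same pattern modeled with two pthread mutexes:

#include <pthread.h>
#include <sched.h>

/* Lock @theirs while holding @mine without risking an ABBA deadlock. */
static int lock_both(pthread_mutex_t *mine, pthread_mutex_t *theirs)
{
	if (pthread_mutex_trylock(theirs) == 0)
		return 0;		/* fast path: no contention */

	pthread_mutex_unlock(mine);
	for (;;) {
		pthread_mutex_lock(theirs);
		if (pthread_mutex_trylock(mine) == 0)
			return 0;	/* both held again */
		pthread_mutex_unlock(theirs);
		sched_yield();		/* brief backoff, like the delay(1) above */
	}
}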
+ */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino); + if (error) + goto out_trans_cancel; + if (parent_ino == NULLFSINO) + parent_ino = dl->root_ino; + if (dotdot_ino == parent_ino) + parent_ino = NULLFSINO; + + /* Drop the link from sc->ip's dotdot entry. */ + error = xfs_droplink(sc->tp, dp); + if (error) + goto out_trans_cancel; + + /* Reset the dotdot entry to a surviving parent. */ + if (parent_ino != NULLFSINO) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + parent_ino, 0); + if (error) + goto out_trans_cancel; + } + + /* Drop the link from dp to sc->ip. */ + error = xfs_droplink(sc->tp, sc->ip); + if (error) + goto out_trans_cancel; + + error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino, + resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_trans_cancel; + } + + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_removename(sc->tp, &dl->ppargs, dp, + &dl->xname, sc->ip); + if (error) + goto out_trans_cancel; + } + + /* + * Notify dirent hooks that we removed the bad link, invalidate the + * dcache, and commit the repair. + */ + xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname); + error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname); + if (error) + goto out_trans_cancel; + + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans_cancel: + xchk_trans_cancel(sc); +out_ilock: + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_iolock: + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Delete a directory entry that points to this directory. Returns -ESTALE + * if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_delete_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + int error; + + /* + * Load the parent pointer and directory inode for this path, then + * drop the scan lock, the ILOCK, and the transaction so that + * _delete_path can reserve the proper transaction. This sets up + * @dl->xname for the deletion. + */ + error = xrep_dirtree_prep_path(dl, path, &step); + if (error) + return error; + + error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp); + if (error) + return error; + + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Delete the directory link and release the parent. */ + error = xrep_dirtree_unlink(dl, dp, path, &step); + xchk_irele(sc, dp); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* Add a new path to represent our in-progress adoption. */ +STATIC int +xrep_dirtree_create_adoption_path( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error; + + /* + * We should have capped the number of paths at XFS_MAXLINK-1 in the + * scanner. + */ + if (dl->nr_paths > XFS_MAXLINK) { + ASSERT(dl->nr_paths <= XFS_MAXLINK); + return -EFSCORRUPTED; + } + + /* + * Create a new xchk_path structure to remember this parent pointer + * and record the first name step. 
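The dotdot rules spelled out above condense into a tiny selection helper, with NULLFSINO doubling as both the "no surviving parent" input and the "no change needed" result, as in the code; a sketch:

#include <stdint.h>

#define NULLFSINO UINT64_MAX

/* Pick the new dotdot target, or NULLFSINO if no change is needed. */
static uint64_t choose_dotdot(uint64_t dotdot_ino, uint64_t surviving_parent,
			      uint64_t root_ino)
{
	uint64_t want;

	want = surviving_parent != NULLFSINO ? surviving_parent : root_ino;
	return dotdot_ino == want ? NULLFSINO : want;
}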
+ */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XREP_DIRPATH_ADOPTING; + + /* + * Record the new link that we just created in the orphanage. Because + * adoption is the last repair that we perform, we don't bother filling + * in the path all the way back to the root. + */ + xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage); + + error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino); + if (error) + goto out_path; + + trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths, + &dl->xname, &dl->pptr_rec); + + error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname, + &dl->pptr_rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; + +out_path: + kfree(path); + return error; +} + +/* + * Prepare to move a file to the orphanage by taking the IOLOCK of the + * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on + * @sc->ip. Returns 0 if we took both locks, or a negative errno if we + * couldn't lock the orphanage in time. + */ +static inline int +xrep_dirtree_adopt_iolock( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Reattach this orphaned directory to the orphanage. Do not call this with + * any resources held. Returns -ESTALE if the scan data have become out of + * date. + */ +STATIC int +xrep_dirtree_adopt( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* Take the IOLOCK of the orphanage and the scrub target. */ + error = xrep_dirtree_adopt_iolock(sc); + if (error) + return error; + + /* + * Set up for an adoption. The directory tree fixer runs after the + * link counts have been corrected. Therefore, we must bump the + * child's link count since there will be no further opportunity to fix + * errors. + */ + error = xrep_adoption_trans_alloc(sc, &dl->adoption); + if (error) + goto out_iolock; + dl->adoption.bump_child_nlink = true; + + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&dl->adoption, &dl->xname); + if (error) + goto out_trans; + + /* + * Now that we have a proposed name for the orphanage entry, create + * a faux path so that the live update hook will see it. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans; + } + error = xrep_dirtree_create_adoption_path(dl); + mutex_unlock(&dl->lock); + if (error) + goto out_trans; + + /* Reparent the directory. */ + error = xrep_adoption_move(&dl->adoption); + if (error) + goto out_trans; + + /* + * Commit the name and release all inode locks except for the scrub + * target's IOLOCK. 
+ */ + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans: + xchk_trans_cancel(sc); +out_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); +out_iolock: + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return error; +} + +/* + * This newly orphaned directory needs to be adopted by the orphanage. + * Make this happen. + */ +STATIC int +xrep_dirtree_move_to_orphanage( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Start by dropping all the resources that we hold so that we can grab + * all the resources that we need for the adoption. + */ + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Perform the adoption. */ + error = xrep_dirtree_adopt(dl); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* + * Try to fix all the problems. Returns -ESTALE if the scan data have become + * out of date. + */ +STATIC int +xrep_dirtree_fix_problems( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + int error; + + /* Delete all the paths we don't want. */ + xchk_dirtree_for_each_path(dl, path) { + if (path->outcome != XCHK_DIRPATH_DELETE) + continue; + + error = xrep_dirtree_delete_path(dl, path); + if (error) + return error; + } + + /* Reparent this directory to the orphanage. */ + if (oc->needs_adoption) { + if (xrep_orphanage_can_adopt(dl->sc)) + return xrep_dirtree_move_to_orphanage(dl); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Fix directory loops involving this directory. */ +int +xrep_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl = sc->buf; + struct xchk_dirtree_outcomes oc; + int error; + + /* + * Prepare to fix the directory tree by retaking the scan lock. The + * order of resource acquisition is still IOLOCK -> transaction -> + * ILOCK -> scan lock. + */ + mutex_lock(&dl->lock); + do { + /* + * Decide what we're going to do, then do it. An -ESTALE + * return here means the scan results are invalid and we have + * to walk again. + */ + if (!dl->stale) { + xrep_dirtree_decide_fate(dl, &oc); + + trace_xrep_dirtree_decided_fate(dl, &oc); + + error = xrep_dirtree_fix_problems(dl, &oc); + if (!error || error != -ESTALE) + break; + } + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -ELNRNG || error == -ENOSR) + error = -EFSCORRUPTED; + } while (!error); + mutex_unlock(&dl->lock); + + return error; +} diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c new file mode 100644 index 000000000000..84487072b6dd --- /dev/null +++ b/fs/xfs/scrub/findparent.c @@ -0,0 +1,470 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" + +/* + * Finding the Parent of a Directory + * ================================= + * + * Directories have parent pointers, in the sense that each directory contains + * a dotdot entry that points to the single allowed parent. The brute force + * way to find the parent of a given directory is to scan every directory in + * the filesystem looking for a child dirent that references this directory. + * + * This module wraps the process of scanning the directory tree. It requires + * that @sc->ip is the directory whose parent we want to find, and that the + * caller hold only the IOLOCK on that directory. The scan itself needs to + * take the ILOCK of each directory visited. + * + * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is + * necessary to use a dirent hook to update the parent scan results. Callers + * must not read the scan results without re-taking @sc->ip's ILOCK. + * + * There are a few shortcuts that we can take to avoid scanning the entire + * filesystem, such as noticing directory tree roots and querying the dentry + * cache for parent information. + */ + +struct xrep_findparent_info { + /* The directory currently being scanned. */ + struct xfs_inode *dp; + + /* + * Scrub context. We're looking for a @dp containing a directory + * entry pointing to sc->ip->i_ino. + */ + struct xfs_scrub *sc; + + /* Optional scan information for a xrep_findparent_scan call. */ + struct xrep_parent_scan_info *parent_scan; + + /* + * Parent that we've found for sc->ip. If we're scanning the entire + * directory tree, we need this to ensure that we only find /one/ + * parent directory. + */ + xfs_ino_t found_parent; + + /* + * This is set to true if @found_parent was not observed directly from + * the directory scan but by noticing a change in dotdot entries after + * cycling the sc->ip IOLOCK. + */ + bool parent_tentative; +}; + +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_findparent_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_findparent_info *fpi = priv; + int error = 0; + + if (xchk_should_terminate(fpi->sc, &error)) + return error; + + if (ino != fpi->sc->ip->i_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* + * Ignore dotdot and dot entries -- we're looking for parent -> child + * links only. + */ + if (name->name[0] == '.' 
&& (name->len == 1 || + (name->len == 2 && name->name[1] == '.'))) + return 0; + + /* Uhoh, more than one parent for a dir? */ + if (fpi->found_parent != NULLFSINO && + !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) { + trace_xrep_findparent_dirent(fpi->sc->ip, 0); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember this. */ + trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino); + fpi->found_parent = fpi->dp->i_ino; + fpi->parent_tentative = false; + + if (fpi->parent_scan) + xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino); + + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_findparent_walk_directory( + struct xrep_findparent_info *fpi) +{ + struct xfs_scrub *sc = fpi->sc; + struct xfs_inode *dp = fpi->dp; + unsigned int lock_mode; + int error = 0; + + /* + * The inode being scanned cannot be its own parent, nor can any + * temporary directory we created to stage this repair. + */ + if (dp == sc->ip || dp == sc->tempip) + return 0; + + /* + * Similarly, temporary files created to stage a repair cannot be the + * parent of this inode. + */ + if (xrep_is_tempfile(dp)) + return 0; + + /* + * Scan the directory to see if it contains an entry pointing to + * the directory that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) + goto out_unlock; + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Update this directory's dotdot pointer based on ongoing dirent updates. + */ +STATIC int +xrep_findparent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent_scan_info *pscan; + struct xfs_scrub *sc; + + pscan = container_of(nb, struct xrep_parent_scan_info, + dhook.dirent_hook.nb); + sc = pscan->sc; + + /* + * If @p->ip is the subdirectory that we're interested in and we've + * already scanned @p->dp, update the dotdot target inumber to the + * parent inode. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) { + if (p->delta > 0) { + xrep_findparent_scan_found(pscan, p->dp->i_ino); + } else { + xrep_findparent_scan_found(pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +} + +/* + * Set up a scan to find the parent of a directory. The provided dirent hook + * will be called when there is a dotdot update for the inode being repaired. 
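The dirent filter in xrep_findparent_dirent above skips '.' and '..' (those are child-side entries) and treats a second parent link as corruption. A self-contained restatement that ignores the tentative-parent refinement; EFSCORRUPTED is modeled with EUCLEAN, which is how the kernel defines it:

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#define NULLFSINO	UINT64_MAX
#define EFSCORRUPTED	EUCLEAN		/* the kernel's mapping */

static int note_parent(uint64_t *found, uint64_t dp_ino, const char *name,
		       size_t len, uint64_t entry_ino, uint64_t target_ino)
{
	if (entry_ino != target_ino)
		return 0;		/* dirent for some other inode */
	if (name[0] == '.' &&
	    (len == 1 || (len == 2 && name[1] == '.')))
		return 0;		/* skip '.' and '..' */
	if (*found != NULLFSINO)
		return -EFSCORRUPTED;	/* more than one parent link */
	*found = dp_ino;
	return 0;
}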
+ */ +int +__xrep_findparent_scan_start( + struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn) +{ + int error; + + if (!(sc->flags & XCHK_FSGATES_DIRENTS)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + return -EINVAL; + } + + pscan->sc = sc; + pscan->parent_ino = NULLFSINO; + + mutex_init(&pscan->lock); + + xchk_iscan_start(sc, 30000, 100, &pscan->iscan); + + /* + * Hook into the dirent update code. The hook only operates on inodes + * that were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + */ + if (custom_fn) + xfs_dir_hook_setup(&pscan->dhook, custom_fn); + else + xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update); + error = xfs_dir_hook_add(sc->mp, &pscan->dhook); + if (error) + goto out_iscan; + + return 0; +out_iscan: + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); + return error; +} + +/* + * Scan the entire filesystem looking for a parent inode for the inode being + * scrubbed. @sc->ip must not be the root of a directory tree. Callers must + * not hold a dirty transaction or any lock that would interfere with taking + * an ILOCK. + * + * Returns 0 with @pscan->parent_ino set to the parent that we found. + * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_scan( + struct xrep_parent_scan_info *pscan) +{ + struct xrep_findparent_info fpi = { + .sc = pscan->sc, + .found_parent = NULLFSINO, + .parent_scan = pscan, + }; + struct xfs_scrub *sc = pscan->sc; + int ret; + + ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode)); + + while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) { + if (S_ISDIR(VFS_I(fpi.dp)->i_mode)) + ret = xrep_findparent_walk_directory(&fpi); + else + ret = 0; + xchk_iscan_mark_visited(&pscan->iscan, fpi.dp); + xchk_irele(sc, fpi.dp); + if (ret) + break; + + if (xchk_should_terminate(sc, &ret)) + break; + } + xchk_iscan_iter_finish(&pscan->iscan); + + return ret; +} + +/* Tear down a parent scan. */ +void +xrep_findparent_scan_teardown( + struct xrep_parent_scan_info *pscan) +{ + xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook); + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); +} + +/* Finish a parent scan early. */ +void +xrep_findparent_scan_finish_early( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + xrep_findparent_scan_found(pscan, ino); + xchk_iscan_finish_early(&pscan->iscan); +} + +/* + * Confirm that the directory @parent_ino actually contains a directory entry + * pointing to the child @sc->ip->ino. This function returns one of several + * ways: + * + * Returns 0 with @parent_ino unchanged if the parent was confirmed. + * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_confirm( + struct xfs_scrub *sc, + xfs_ino_t *parent_ino) +{ + struct xrep_findparent_info fpi = { + .sc = sc, + .found_parent = NULLFSINO, + }; + int error; + + /* The root directory always points to itself. */ + if (sc->ip == sc->mp->m_rootip) { + *parent_ino = sc->mp->m_sb.sb_rootino; + return 0; + } + + /* The metadata root directory always points to itself. */ + if (sc->ip == sc->mp->m_metadirip) { + *parent_ino = sc->mp->m_sb.sb_metadirino; + return 0; + } + + /* Unlinked dirs can point anywhere; point them up to the root dir. 
*/ + if (VFS_I(sc->ip)->i_nlink == 0) { + *parent_ino = xchk_inode_rootdir_inum(sc->ip); + return 0; + } + + /* Reject garbage parent inode numbers and self-referential parents. */ + if (*parent_ino == NULLFSINO) + return 0; + if (!xfs_verify_dir_ino(sc->mp, *parent_ino) || + *parent_ino == sc->ip->i_ino) { + *parent_ino = NULLFSINO; + return 0; + } + + error = xchk_iget(sc, *parent_ino, &fpi.dp); + if (error) + return error; + + if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) { + *parent_ino = NULLFSINO; + goto out_rele; + } + + error = xrep_findparent_walk_directory(&fpi); + if (error) + goto out_rele; + + *parent_ino = fpi.found_parent; +out_rele: + xchk_irele(sc, fpi.dp); + return error; +} + +/* + * If we're the root of a directory tree, we are our own parent. If we're an + * unlinked directory, the parent /won't/ have a link to us. Set the parent + * directory to the root for both cases. Returns NULLFSINO if we don't know + * what to do. + */ +xfs_ino_t +xrep_findparent_self_reference( + struct xfs_scrub *sc) +{ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) + return sc->mp->m_sb.sb_rootino; + + if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino) + return sc->mp->m_sb.sb_metadirino; + + if (VFS_I(sc->ip)->i_nlink == 0) + return xchk_inode_rootdir_inum(sc->ip); + + return NULLFSINO; +} + +/* Check the dentry cache to see if it knows of a parent for the scrub target. */ +xfs_ino_t +xrep_findparent_from_dcache( + struct xfs_scrub *sc) +{ + struct inode *pip = NULL; + struct dentry *dentry, *parent; + xfs_ino_t ret = NULLFSINO; + + dentry = d_find_alias(VFS_I(sc->ip)); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + ASSERT(parent->d_sb == sc->ip->i_mount->m_super); + + pip = igrab(d_inode(parent)); + dput(parent); + + if (S_ISDIR(pip->i_mode)) { + trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino); + ret = XFS_I(pip)->i_ino; + } + + xchk_irele(sc, XFS_I(pip)); + +out_dput: + dput(dentry); +out: + return ret; +} diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h new file mode 100644 index 000000000000..d998c7a88152 --- /dev/null +++ b/fs/xfs/scrub/findparent.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FINDPARENT_H__ +#define __XFS_SCRUB_FINDPARENT_H__ + +struct xrep_parent_scan_info { + struct xfs_scrub *sc; + + /* Inode scan cursor. */ + struct xchk_iscan iscan; + + /* Hook to capture directory entry updates. */ + struct xfs_dir_hook dhook; + + /* Lock protecting parent_ino. */ + struct mutex lock; + + /* Parent inode that we've found. 
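xrep_findparent_confirm and xrep_findparent_self_reference above apply the same precedence: roots answer for themselves, unlinked directories get pointed at the root, and only then is a candidate hint validated. A condensed sketch with a toy validator standing in for xfs_verify_dir_ino:

#include <stdbool.h>
#include <stdint.h>

#define NULLFSINO UINT64_MAX

static bool ino_is_valid_dir(uint64_t ino)
{
	return ino != 0 && ino != NULLFSINO;	/* toy xfs_verify_dir_ino */
}

static uint64_t vet_parent_hint(uint64_t self, uint64_t hint, bool is_root,
				bool unlinked, uint64_t root)
{
	if (is_root)
		return self;		/* roots point to themselves */
	if (unlinked)
		return root;		/* reattach under the root dir */
	if (hint == self || !ino_is_valid_dir(hint))
		return NULLFSINO;	/* reject garbage hints */
	return hint;			/* caller confirms by dir walk */
}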
*/ + xfs_ino_t parent_ino; + + bool lookup_parent; +}; + +int __xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn); +static inline int xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan) +{ + return __xrep_findparent_scan_start(sc, pscan, NULL); +} +int xrep_findparent_scan(struct xrep_parent_scan_info *pscan); +void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan); + +static inline void +xrep_findparent_scan_found( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + mutex_lock(&pscan->lock); + pscan->parent_ino = ino; + mutex_unlock(&pscan->lock); +} + +void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan, + xfs_ino_t ino); + +int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino); + +xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc); +xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_FINDPARENT_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index d310737c8823..9b598c5790ad 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -19,6 +19,7 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_icache.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -74,10 +75,9 @@ xchk_fscount_warmup( struct xfs_buf *agi_bp = NULL; struct xfs_buf *agf_bp = NULL; struct xfs_perag *pag = NULL; - xfs_agnumber_t agno; int error = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; if (xfs_perag_initialised_agi(pag) && @@ -85,7 +85,7 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); if (error) break; error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); @@ -123,7 +123,7 @@ xchk_fsfreeze( { int error; - error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsfreeze(sc, error); return error; } @@ -135,7 +135,7 @@ xchk_fsthaw( int error; /* This should always succeed, we have a kernel freeze */ - error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsthaw(sc, error); return error; } @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); @@ -295,9 +295,8 @@ xchk_fscount_aggregate_agcounts( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; uint64_t delayed; - xfs_agnumber_t agno; int tries = 8; int error = 0; @@ -306,7 +305,7 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; @@ -327,7 +326,7 @@ retry: if (xfs_has_lazysbcount(sc->mp)) { fsc->fdblocks += pag->pagf_btreeblks; } else { - error = xchk_fscount_btreeblks(sc, fsc, agno); + error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag)); if (error) break; } @@ -351,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. 
*/ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -388,7 +387,7 @@ retry: #ifdef CONFIG_XFS_RT STATIC int xchk_fscount_add_frextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -409,23 +408,34 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = NULL; int error; fsc->frextents = 0; - if (!xfs_has_realtime(mp)) + fsc->frextents_delayed = 0; + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_query_all(sc->mp, sc->tp, - xchk_fscount_add_frextent, fsc); - if (error) { - xchk_set_incomplete(sc); - goto out_unlock; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, sc->tp, + xchk_fscount_add_frextent, fsc); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xchk_set_incomplete(sc); + xfs_rtgroup_rele(rtg); + return error; + } } -out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - return error; + fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); + return 0; } #else STATIC int @@ -434,6 +444,7 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { fsc->frextents = 0; + fsc->frextents_delayed = 0; return 0; } #endif /* CONFIG_XFS_RT */ @@ -508,8 +519,8 @@ xchk_fscounters( /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! */ if (icount < 0 || ifree < 0) @@ -517,7 +528,7 @@ xchk_fscounters( /* * If the filesystem is not frozen, the counter summation calls above - * can race with xfs_mod_freecounter, which subtracts a requested space + * can race with xfs_dec_freecounter, which subtracts a requested space * reservation from the counter and undoes the subtraction if that made * the counter go negative. 
Therefore, it's possible to see negative * values here, and we should only flag that as a corruption if we @@ -584,16 +595,18 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) { + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, + fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); else diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h index 461a13d25f4b..bcf56e1c36f9 100644 --- a/fs/xfs/scrub/fscounters.h +++ b/fs/xfs/scrub/fscounters.h @@ -12,6 +12,7 @@ struct xchk_fscounters { uint64_t ifree; uint64_t fdblocks; uint64_t frextents; + uint64_t frextents_delayed; unsigned long long icount_min; unsigned long long icount_max; bool frozen; diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 94cdb852bee4..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,9 +64,22 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); - percpu_counter_set(&mp->m_frextents, fsc->frextents); - mp->m_sb.sb_frextents = fsc->frextents; + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); + + /* + * Online repair is only supported on v5 file systems, which require + * lazy sb counters and thus no update of sb_fdblocks here. But + * sb_frextents only uses a lazy counter with rtgroups, and thus needs + * to be updated directly here otherwise. And for that we need to keep + * track of the delalloc reservations separately, as they are + * subtracted from m_frextents, but not included in sb_frextents. + */ + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 9020a6bef7f1..3c0f25098b69 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -12,6 +12,7 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/health.h" #include "scrub/common.h" @@ -70,10 +71,11 @@ /* Map our scrub type to a sick mask and a set of health update functions. 
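The type_to_health_flag table that follows drives all of the dispatch in this file: each scrub type names a group and the sick-mask bits it owns. A toy version of the same table-driven lookup (values hypothetical):

#include <stddef.h>

enum health_group { HG_NONE, HG_FS, HG_AG, HG_INO, HG_RTGROUP };

struct health_map {
	enum health_group group;
	unsigned int sick_mask;
};

/* One entry per scrub type; designated initializers leave gaps zeroed. */
static const struct health_map to_health[] = {
	[0] = { HG_NONE, 0 },		/* probe: updates nothing */
	[1] = { HG_AG, 1U << 0 },	/* superblock */
	[2] = { HG_AG, 1U << 1 },	/* AGF */
};

static const struct health_map *lookup_health(unsigned int type)
{
	if (type >= sizeof(to_health) / sizeof(to_health[0]))
		return NULL;
	return &to_health[type];
}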
*/ enum xchk_health_group { - XHG_FS = 1, - XHG_RT, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, + XHG_RTGROUP, }; struct xchk_health_map { @@ -82,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -100,14 +103,19 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, - [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, - [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RTGROUP, XFS_SICK_RG_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RTGROUP, XFS_SICK_RG_SUMMARY }, [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, + [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, + [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, + [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, + [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT }, }; /* Return the health status mask for this scrub type. */ @@ -129,7 +137,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -159,13 +167,14 @@ STATIC void xchk_mark_all_healthy( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); - xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); - for_each_perag(mp, agno, pag) - xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); + while ((pag = xfs_perag_next(mp, pag))) + xfs_group_mark_healthy(pag_group(pag), XFS_SICK_AG_INDIRECT); + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_group_mark_healthy(rtg_group(rtg), XFS_SICK_RG_INDIRECT); } /* @@ -183,6 +192,8 @@ xchk_update_health( struct xfs_scrub *sc) { struct xfs_perag *pag; + struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -197,49 +208,57 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_corrupt(pag, sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. 
- */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; - case XHG_RT: + case XHG_RTGROUP: + if (!mask) + return; + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); + xfs_rtgroup_put(rtg); break; default: ASSERT(0); @@ -276,7 +295,7 @@ xchk_ag_btree_del_cursor_if_sick( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { + if (xfs_group_has_sickness((*curp)->bc_group, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); *curp = NULL; @@ -293,9 +312,8 @@ xchk_health_record( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; - + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; unsigned int sick; unsigned int checked; @@ -303,15 +321,17 @@ xchk_health_record( if (sick & XFS_SICK_FS_PRIMARY) xchk_set_corrupt(sc); - xfs_rt_measure_sickness(mp, &sick, &checked); - if (sick & XFS_SICK_RT_PRIMARY) - xchk_set_corrupt(sc); - - for_each_perag(mp, agno, pag) { - xfs_ag_measure_sickness(pag, &sick, &checked); + while ((pag = xfs_perag_next(mp, pag))) { + xfs_group_measure_sickness(pag_group(pag), &sick, &checked); if (sick & XFS_SICK_AG_PRIMARY) xchk_set_corrupt(sc); } + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + if (sick & XFS_SICK_RG_PRIMARY) + xchk_set_corrupt(sc); + } + return 0; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 750d7b0cd25a..4dc7c83dc08a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -258,7 +258,7 @@ xchk_iallocbt_chunk( { struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(bs->cur->bc_group); xfs_agblock_t agbno; xfs_extlen_t len; @@ -303,7 +303,6 @@ xchk_iallocbt_check_cluster_ifree( unsigned int irec_ino, struct xfs_dinode *dip) { - struct xfs_mount *mp = bs->cur->bc_mp; xfs_ino_t fsino; xfs_agino_t agino; bool irec_free; @@ -319,7 +318,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. 
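/*
 * Editor's sketch (the bit-split is an assumption based on the helper
 * conversion above, not text from this patch): a filesystem-wide inode
 * number composes the AG number above the AG-relative inode number, whose
 * width is sb_inopblog + sb_agblklog bits.
 */
#include <stdint.h>

static inline uint64_t
example_agino_to_ino(uint32_t agno, uint32_t agino, unsigned int agino_bits)
{
	return ((uint64_t)agno << agino_bits) | agino;
}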
*/ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); + fsino = xfs_agino_to_ino(to_perag(bs->cur->bc_group), agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -368,7 +367,6 @@ xchk_iallocbt_check_cluster( struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -396,7 +394,7 @@ xchk_iallocbt_check_cluster( * ir_startino can be large enough to make im_boffset nonzero. */ ir_holemask = (irec->ir_holemask & cluster_mask); - imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_blkno = xfs_agbno_to_daddr(to_perag(bs->cur->bc_group), agbno); imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) << mp->m_sb.sb_inodelog; @@ -407,9 +405,9 @@ xchk_iallocbt_check_cluster( return 0; } - trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, - imap.im_blkno, imap.im_len, cluster_base, nr_inodes, - cluster_mask, ir_holemask, + trace_xchk_iallocbt_check_cluster(to_perag(bs->cur->bc_group), + irec->ir_startino, imap.im_blkno, imap.im_len, + cluster_base, nr_inodes, cluster_mask, ir_holemask, XFS_INO_TO_OFFSET(mp, irec->ir_startino + cluster_base)); @@ -585,7 +583,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_inobt_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -652,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index a00ec7ae1792..14e48d3f1912 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -146,15 +146,12 @@ xrep_ibt_check_ifree( struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp; struct xfs_dinode *dip; - xfs_ino_t fsino; xfs_agino_t agino; - xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; unsigned int cluster_buf_base; unsigned int offset; int error; agino = cluster_ag_base + cluster_index; - fsino = XFS_AGINO_TO_INO(mp, agno, agino); /* Inode uncached or half assembled, read disk buffer */ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); @@ -165,7 +162,8 @@ xrep_ibt_check_ifree( if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) return -EFSCORRUPTED; - if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + if (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino)) return -EFSCORRUPTED; /* Will the in-core inode tell us if it's in use? */ @@ -194,7 +192,7 @@ xrep_ibt_stash( if (ri->rie.ir_freecount > 0) ri->finobt_recs++; - trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie); error = xfarray_append(ri->inode_records, &ri->rie); if (error) @@ -307,7 +305,7 @@ xrep_ibt_process_cluster( * inobt because imap_to_bp directly maps the buffer without touching * either inode btree. 
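/*
 * Editor's worked example of the im_boffset shift shown above (geometry
 * values invented for illustration): with 4096-byte blocks and 512-byte
 * inodes, sb_inodelog is 9 and each block holds 8 inodes, so the inode at
 * index 3 within its block sits at buffer offset 3 << 9 == 1536 bytes.
 */
static inline unsigned int
example_im_boffset(unsigned int index_in_block, unsigned int inodelog)
{
	return index_in_block << inodelog;	/* 3 << 9 == 1536 */
}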
*/ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno); imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); imap.im_boffset = 0; error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); @@ -423,9 +421,7 @@ xrep_ibt_record_inode_blocks( if (error) return error; - trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, - rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, - rec->rm_offset, rec->rm_flags); + trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec); /* * Record the free/hole masks for each inode cluster that could be @@ -634,7 +630,6 @@ xrep_ibt_build_new_trees( struct xfs_scrub *sc = ri->sc; struct xfs_btree_cur *ino_cur; struct xfs_btree_cur *fino_cur = NULL; - xfs_fsblock_t fsbno; bool need_finobt; int error; @@ -656,9 +651,8 @@ xrep_ibt_build_new_trees( * * Start by setting up the inobt staging cursor. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_IBT_BLOCK(sc->mp)), - xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, + xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)), XFS_AG_RESV_NONE); ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; @@ -677,10 +671,9 @@ xrep_ibt_build_new_trees( if (sc->mp->m_finobt_nores) resv = XFS_AG_RESV_NONE; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_FIBT_BLOCK(sc->mp)), xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, - fsbno, resv); + xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)), + resv); ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; @@ -821,7 +814,7 @@ xrep_iallocbt( sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; /* Set up enough storage to handle an AG with nothing but inodes. */ - xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino); last_agino /= XFS_INODES_PER_CHUNK; descr = xchk_xfile_ag_descr(sc, "inode index records"); error = xfarray_create(descr, last_agino, diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h new file mode 100644 index 000000000000..1300833679ab --- /dev/null +++ b/fs/xfs/scrub/ino_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_INO_BITMAP_H__ +#define __XFS_SCRUB_INO_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_ino_t */ + +struct xino_bitmap { + struct xbitmap64 inobitmap; +}; + +static inline void xino_bitmap_init(struct xino_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->inobitmap); +} + +static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->inobitmap); +} + +static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + return xbitmap64_set(&bitmap->inobitmap, ino, 1); +} + +static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + uint64_t len = 1; + + return xbitmap64_test(&bitmap->inobitmap, ino, &len); +} + +#endif /* __XFS_SCRUB_INO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6e2fe2d6250b..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -60,6 +60,22 @@ xchk_install_handle_iscrub( if (error) return error; + /* + * Don't allow scrubbing by handle of any non-directory inode records + * in the metadata directory tree. We don't know if any of the scans + * launched by this scrubber will end up indirectly trying to lock this + * file. + * + * Scrubbers of inode-rooted metadata files (e.g. quota files) will + * attach all the resources needed to scrub the inode and call + * xchk_inode directly. Userspace cannot call this directly. + */ + if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_irele(sc, ip); + sc->ip = NULL; + return -ENOENT; + } + return xchk_prepare_iscrub(sc); } @@ -94,9 +110,15 @@ xchk_setup_inode( return xchk_prepare_iscrub(sc); } - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. + */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -238,12 +260,7 @@ xchk_inode_extsize( xchk_ino_set_warning(sc, ino); } -/* - * Validate di_cowextsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_cowextsize hint. */ STATIC void xchk_inode_cowextsize( struct xfs_scrub *sc, @@ -254,12 +271,32 @@ xchk_inode_cowextsize( uint64_t flags2) { xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_cowextsize); + + /* + * The used block counter for rtrmap is checked and repaired elsewhere. + */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; - fa = xfs_inode_validate_cowextsize(sc->mp, - be32_to_cpu(dip->di_cowextsize), mode, flags, - flags2); + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with cowextsize and rtinherit set, the hint could become + * misaligned with the new rextsize. The verifier doesn't check this, + * because we allow rtinherit directories even without an rt device.
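/*
 * Editor's worked example of the alignment test described here (numbers
 * invented for illustration): a directory with a 6-block CoW extent size
 * hint is aligned while the rt extent size is 3 blocks (6 % 3 == 0), but
 * becomes misaligned if the rt section is regrown with rextsize 4, since
 * 6 % 4 == 2.
 */
static inline int
example_cowextsize_misaligned(unsigned int cowextsize, unsigned int rextsize)
{
	return cowextsize % rextsize != 0;
}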
+ * Flag this as an administrative warning since we will clean this up + * eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags2 & XFS_DIFLAG2_COWEXTSIZE) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); } /* Make sure the di_flags make sense for the inode. */ @@ -338,8 +375,9 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) goto bad; - /* realtime and reflink make no sense, currently */ - if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + /* realtime and reflink don't always go together */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_has_rtreflink(mp)) goto bad; /* no bigtime iflag without the bigtime feature */ @@ -421,8 +459,13 @@ xchk_dinode( break; case 2: case 3: - if (dip->di_onlink != 0) - xchk_ino_set_corrupt(sc, ino); + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + xchk_ino_set_corrupt(sc, ino); + } else { + if (dip->di_metatype != 0) + xchk_ino_set_corrupt(sc, ino); + } if (dip->di_mode == 0 && sc->ip) xchk_ino_set_corrupt(sc, ino); @@ -475,6 +518,10 @@ xchk_dinode( if (!S_ISREG(mode) && !S_ISDIR(mode)) xchk_ino_set_corrupt(sc, ino); break; + case XFS_DINODE_FMT_META_BTREE: + if (!S_ISREG(mode)) + xchk_ino_set_corrupt(sc, ino); + break; case XFS_DINODE_FMT_UUID: default: xchk_ino_set_corrupt(sc, ino); @@ -659,15 +706,13 @@ xchk_inode_xref_bmap( return; /* Walk all the extents to check nextents/naextents/nblocks. */ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; if (nextents != xfs_dfork_attr_extents(dip)) @@ -739,6 +784,23 @@ xchk_inode_check_reflink_iflag( xchk_ino_set_corrupt(sc, ino); } +/* + * If this inode has zero link count, it must be on the unlinked list. If + * it has nonzero link count, it must not be on the unlinked list. + */ +STATIC void +xchk_inode_check_unlinked( + struct xfs_scrub *sc) +{ + if (VFS_I(sc->ip)->i_nlink == 0) { + if (!xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } +} + /* Scrub an inode. 
*/ int xchk_inode( struct xfs_scrub *sc) { @@ -771,6 +833,8 @@ xchk_inode( if (S_ISREG(VFS_I(sc->ip)->i_mode)) xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); + xchk_inode_check_unlinked(sc); + xchk_inode_xref(sc, sc->ip->i_ino, &di); out: return error; } diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index eab380e95ef4..a90a011c7e5f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -38,6 +38,9 @@ #include "xfs_log_priv.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -46,6 +49,7 @@ #include "scrub/repair.h" #include "scrub/iscan.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" /* * Inode Record Repair @@ -282,6 +286,51 @@ xrep_dinode_findmode_dirent( return 0; } +/* Try to lock a directory, or wait a jiffy. */ +static inline bool +xrep_dinode_ilock_nowait( + struct xfs_inode *dp, + unsigned int lock_mode) +{ + if (xfs_ilock_nowait(dp, lock_mode)) + return true; + + schedule_timeout_killable(1); + return false; +} + +/* + * Try to lock a directory to look for ftype hints. Since we already hold the + * AGI buffer, we cannot block waiting for the ILOCK because rename can take + * the ILOCK and then try to lock AGIs. + */ +STATIC int +xrep_dinode_trylock_directory( + struct xrep_inode *ri, + struct xfs_inode *dp, + unsigned int *lock_modep) +{ + unsigned long deadline = jiffies + msecs_to_jiffies(30000); + unsigned int lock_mode; + int error = 0; + + do { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (xfs_need_iread_extents(&dp->i_df)) + lock_mode = XFS_ILOCK_EXCL; + else + lock_mode = XFS_ILOCK_SHARED; + + if (xrep_dinode_ilock_nowait(dp, lock_mode)) { + *lock_modep = lock_mode; + return 0; + } + } while (!time_is_before_jiffies(deadline)); + return -EBUSY; +} + /* * If this is a directory, walk the dirents looking for any that point to the * scrub target inode. @@ -295,11 +344,17 @@ xrep_dinode_findmode_walk_directory( unsigned int lock_mode; int error = 0; + /* Ignore temporary repair directories. */ + if (xrep_is_tempfile(dp)) + return 0; + /* * Scan the directory to see if it contains an entry pointing to * the directory that we are repairing. */ - lock_mode = xfs_ilock_data_map_shared(dp); + error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); + if (error) + return error; /* * If this directory is known to be sick, we cannot scan it reliably @@ -356,6 +411,7 @@ xrep_dinode_find_mode( * so there's a real possibility that _iscan_iter can return EBUSY. */ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); ri->ftype_iscan.skip_ino = sc->sm->sm_ino; ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { @@ -463,6 +519,24 @@ xrep_dinode_mode( return 0; } +/* Fix unused link count fields having nonzero values. */ +STATIC void +xrep_dinode_nlinks( + struct xfs_dinode *dip) +{ + if (dip->di_version < 2) { + dip->di_nlink = 0; + return; + } + + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN); + } else { + dip->di_metatype = 0; + } +} + /* Fix any conflicting flags that the verifiers complain about.
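/*
 * Editor's note (an inference from the checks above, not stated in this
 * patch): di_metatype is taken to occupy the storage of the legacy v1
 * di_onlink field, which is why nonzero leftovers must be sanitized on
 * non-metadir inodes.  A sketch of the invariant being restored:
 */
static inline int
example_metatype_ok(unsigned short metatype, int is_metadir,
		unsigned short metafile_max)
{
	if (is_metadir)
		return metatype < metafile_max;	/* must be a known type */
	return metatype == 0;			/* field must stay zero */
}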
*/ STATIC void xrep_dinode_flags( @@ -491,8 +565,6 @@ xrep_dinode_flags( flags2 |= XFS_DIFLAG2_REFLINK; else flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); - if (flags & XFS_DIFLAG_REALTIME) - flags2 &= ~XFS_DIFLAG2_REFLINK; if (!xfs_has_bigtime(mp)) flags2 &= ~XFS_DIFLAG2_BIGTIME; if (!xfs_has_large_extent_counts(mp)) @@ -501,6 +573,16 @@ xrep_dinode_flags( dip->di_nrext64_pad = 0; else if (dip->di_version >= 3) dip->di_v3_pad = 0; + + if (flags2 & XFS_DIFLAG2_METADATA) { + xfs_failaddr_t fa; + + fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags, + flags2); + if (fa) + flags2 &= ~XFS_DIFLAG2_METADATA; + } + dip->di_flags = cpu_to_be16(flags); dip->di_flags2 = cpu_to_be64(flags2); } @@ -628,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), @@ -692,19 +776,72 @@ xrep_dinode_count_ag_rmaps( return error; } +/* Count extents and blocks for an inode given an rt rmap. */ +STATIC int +xrep_dinode_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + ri->rt_blocks += rec->rm_blockcount; + ri->rt_extents++; + return 0; +} + +/* Count extents and blocks for an inode from all realtime rmap data. */ +STATIC int +xrep_dinode_count_rtgroup_rmaps( + struct xrep_inode *ri, + struct xfs_rtgroup *rtg) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap, + ri); + xchk_rtgroup_btcur_free(&sc->sr); + xchk_rtgroup_free(sc, &sc->sr); + return error; +} + /* Count extents and blocks for a given inode from all rmap data. 
*/ STATIC int xrep_dinode_count_rmaps( struct xrep_inode *ri) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; int error; - if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + if (!xfs_has_rmapbt(ri->sc->mp)) return -EOPNOTSUPP; - for_each_perag(ri->sc->mp, agno, pag) { + while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) { + error = xrep_dinode_count_rtgroup_rmaps(ri, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } + + while ((pag = xfs_perag_next(ri->sc->mp, pag))) { error = xrep_dinode_count_ag_rmaps(ri, pag); if (error) { xfs_perag_rele(pag); @@ -782,7 +919,7 @@ xrep_dinode_bad_bmbt_fork( nrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); - if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size) return true; if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) return true; @@ -794,12 +931,12 @@ xrep_dinode_bad_bmbt_fork( xfs_fileoff_t fileoff; xfs_fsblock_t fsbno; - fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fkp = xfs_bmdr_key_addr(dfp, i); fileoff = be64_to_cpu(fkp->br_startoff); if (!xfs_verify_fileoff(sc->mp, fileoff)) return true; - fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr); fsbno = be64_to_cpu(*fpp); if (!xfs_verify_fsbno(sc->mp, fsbno)) return true; @@ -808,6 +945,83 @@ xrep_dinode_bad_bmbt_fork( return false; } +/* Return true if this rtrmap-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrmapbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrmap_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrmap_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrmap_maxlevels) + return true; + if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + +/* Return true if this rtrefcount-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_rtrefcountbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size) +{ + struct xfs_rtrefcount_root *dfp; + unsigned int nrecs; + unsigned int level; + + if (dfork_size < sizeof(struct xfs_rtrefcount_root)) + return true; + + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > sc->mp->m_rtrefc_maxlevels) + return true; + if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size) + return true; + if (level > 0 && nrecs == 0) + return true; + + return false; +} + +/* Check a metadata-btree fork. */ +STATIC bool +xrep_dinode_bad_metabt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + if (whichfork != XFS_DATA_FORK) + return true; + + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size); + case XFS_METAFILE_RTREFCOUNT: + return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size); + default: + return true; + } +} + /* * Check the data fork for things that will fail the ifork verifiers or the * ifork formatters.
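/*
 * Editor's sketch of the sizing rule behind the two droot checks above
 * (field sizes are placeholders; the real geometry comes from the rtrmap
 * and rtrefcount btree formats): a btree root stored in the inode fork
 * must fit its header plus nrecs keys and pointers in the data fork.
 */
#include <stddef.h>

static inline int
example_droot_fits(size_t hdr_size, unsigned int nrecs, size_t key_size,
		size_t ptr_size, size_t dfork_size)
{
	return hdr_size + nrecs * (key_size + ptr_size) <= dfork_size;
}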
@@ -841,9 +1057,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { @@ -888,6 +1112,11 @@ xrep_dinode_check_dfork( XFS_DATA_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; default: return true; } @@ -1008,6 +1237,11 @@ xrep_dinode_check_afork( XFS_ATTR_FORK)) return true; break; + case XFS_DINODE_FMT_META_BTREE: + if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; default: return true; } @@ -1055,9 +1289,11 @@ xrep_dinode_ensure_forkoff( uint16_t mode) { struct xfs_bmdr_block *bmdr; + struct xfs_rtrmap_root *rmdr; + struct xfs_rtrefcount_root *rcdr; struct xfs_scrub *sc = ri->sc; xfs_extnum_t attr_extents, data_extents; - size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + size_t bmdr_minsz = xfs_bmdr_space_calc(1); unsigned int lit_sz = XFS_LITINO(sc->mp); unsigned int afork_min, dfork_min; @@ -1109,7 +1345,7 @@ xrep_dinode_ensure_forkoff( case XFS_DINODE_FMT_BTREE: /* Must have space for btree header and key/pointers. */ bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); - afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + afork_min = xfs_bmap_broot_space(sc->mp, bmdr); break; default: /* We should never see any other formats. */ @@ -1159,7 +1395,22 @@ xrep_dinode_ensure_forkoff( case XFS_DINODE_FMT_BTREE: /* Must have space for btree header and key/pointers. */ bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); - dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); + break; + case XFS_DINODE_FMT_META_BTREE: + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr); + break; + case XFS_METAFILE_RTREFCOUNT: + rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr); + break; + default: + dfork_min = 0; + break; + } break; default: dfork_min = 0; @@ -1309,8 +1560,7 @@ xrep_dinode_core( /* Read the inode cluster buffer. */ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, - NULL); + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); if (error) return error; @@ -1324,6 +1574,7 @@ xrep_dinode_core( iget_error = xrep_dinode_mode(ri, dip); if (iget_error) goto write; + xrep_dinode_nlinks(dip); xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); @@ -1419,8 +1670,7 @@ xrep_inode_blockcounts( trace_xrep_inode_blockcounts(sc); /* Set data fork counters from the data fork mappings. */ - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, - &nextents, &count); + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); if (error) return error; if (xfs_is_reflink_inode(sc->ip)) { @@ -1444,8 +1694,8 @@ xrep_inode_blockcounts( /* Set attr fork counters from the attr fork mappings. 
*/ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); if (ifp) { - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, - &nextents, &acount); + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, + &acount); if (error) return error; if (count >= sc->mp->m_sb.sb_dblocks) @@ -1583,10 +1833,6 @@ xrep_inode_flags( /* DAX only applies to files and dirs. */ if (!(S_ISREG(mode) || S_ISDIR(mode))) sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; - - /* No reflink files on the realtime device. */ - if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) - sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; } /* @@ -1671,6 +1917,51 @@ xrep_inode_extsize( } } +/* Ensure this file has an attr fork if it needs to hold a parent pointer. */ +STATIC int +xrep_inode_pptr( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct inode *inode = VFS_I(ip); + + if (!xfs_has_parent(mp)) + return 0; + + /* + * Unlinked inodes that cannot be added to the directory tree will not + * have a parent pointer. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return 0; + + /* Children of the superblock do not have parent pointers. */ + if (xchk_inode_is_sb_rooted(ip)) + return 0; + + /* Inode already has an attr fork; no further work possible here. */ + if (xfs_inode_has_attr_fork(ip)) + return 0; + + return xfs_bmap_add_attrfork(sc->tp, ip, + sizeof(struct xfs_attr_sf_hdr), true); +} + +/* Fix COW extent size hint problems. */ +STATIC void +xrep_inode_cowextsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned CoW extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + sc->ip->i_cowextsize % sc->mp->m_sb.sb_rextsize > 0) { + sc->ip->i_cowextsize = 0; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + } +} + /* Fix any irregularities in an inode that the verifiers don't catch. */ STATIC int xrep_inode_problems( @@ -1681,6 +1972,9 @@ error = xrep_inode_blockcounts(sc); if (error) return error; + error = xrep_inode_pptr(sc); + if (error) + return error; xrep_inode_timestamps(sc->ip); xrep_inode_flags(sc); xrep_inode_ids(sc); @@ -1691,12 +1985,52 @@ if (S_ISDIR(VFS_I(sc->ip)->i_mode)) xrep_inode_dir_size(sc); xrep_inode_extsize(sc); + xrep_inode_cowextsize(sc); trace_xrep_inode_fixed(sc); xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); return xrep_roll_trans(sc); } +/* + * Make sure this inode's unlinked list pointers are consistent with its + * link count. + */ +STATIC int +xrep_inode_unlinked( + struct xfs_scrub *sc) +{ + unsigned int nlink = VFS_I(sc->ip)->i_nlink; + int error; + + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { + struct xfs_perag *pag; + + pag = xfs_perag_get(sc->mp, + XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + if (error) + return error; + } + + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { + error = xfs_iunlink(sc->tp, sc->ip); + if (error) + return error; + } + + return 0; +} + /* Repair an inode's fields.
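/*
 * Editor's summary (sketch only) of the invariant that
 * xrep_inode_unlinked() above restores: an inode sits on the AGI unlinked
 * list if and only if its link count is zero.
 */
static inline int
example_unlinked_consistent(unsigned int nlink, int on_unlinked_list)
{
	return (nlink == 0) == (on_unlinked_list != 0);
}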
*/ int xrep_inode( struct xfs_scrub *sc) { @@ -1746,5 +2081,10 @@ return error; } + /* Reconnect incore unlinked list */ + error = xrep_inode_unlinked(sc); + if (error) + return error; + return xrep_defer_finish(sc); } diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index ec3478bc505e..84f117667ca2 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -67,7 +67,7 @@ xchk_iscan_mask_skipino( xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); - if (pag->pag_agno != skip_agno) + if (pag_agno(pag) != skip_agno) return; if (skip_agino < rec->ir_startino) return; @@ -95,7 +95,7 @@ xchk_iscan_find_next( struct xfs_btree_cur *cur; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = sc->tp; - xfs_agnumber_t agno = pag->pag_agno; + xfs_agnumber_t agno = pag_agno(pag); xfs_agino_t lastino = NULLAGINO; xfs_agino_t first, last; xfs_agino_t agino = *cursor; @@ -243,6 +243,51 @@ xchk_iscan_finish( mutex_unlock(&iscan->lock); } +/* Mark an inode scan finished before we actually scan anything. */ +void +xchk_iscan_finish_early( + struct xchk_iscan *iscan) +{ + ASSERT(iscan->cursor_ino == iscan->scan_start_ino); + ASSERT(iscan->__visited_ino == iscan->scan_start_ino); + + xchk_iscan_finish(iscan); +} + +/* + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, + * or the usual negative errno. + */ +STATIC int +xchk_iscan_read_agi( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned long relax; + int ret; + + if (!xchk_iscan_agi_needs_trylock(iscan)) + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); + + relax = msecs_to_jiffies(iscan->iget_retry_delay); + do { + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, + agi_bpp); + if (ret != -EAGAIN) + return ret; + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + trace_xchk_iscan_agi_retry_wait(iscan); + } while (!schedule_timeout_killable(relax) && + !xchk_iscan_aborted(iscan)); + return -ECANCELED; +} + /* * Advance ino to the next inode that the inobt thinks is allocated, being * careful to jump to the next AG if we've reached the right end of this AG's @@ -281,7 +326,7 @@ xchk_iscan_advance( if (!pag) return -ECANCELED; - ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); if (ret) goto out_pag; @@ -363,6 +408,15 @@ xchk_iscan_iget_retry( } /* + * For an inode scan, we hold the AGI and want to try to grab a batch of + * inodes. Holding the AGI prevents inodegc from clearing freed inodes, + * so we must use noretry here. For every inode after the first one in the + * batch, we don't want to wait, so we use noretry there too. Finally, use + * dontcache to avoid polluting the cache. + */ +#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE) + +/* * Grab an inode as part of an inode scan. While scanning this inode, the * caller must ensure that no other threads can modify the inode until a call * to xchk_iscan_visit succeeds. @@ -389,7 +443,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[0] == NULL); /* Fill the first slot in the inode array.
*/ - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); trace_xchk_iscan_iget(iscan, error); @@ -402,8 +456,13 @@ xchk_iscan_iget( * It's possible that this inode has lost all of its links but * hasn't yet been inactivated. If we don't have a transaction * or it's not writable, flush the inodegc workers and wait. + * If we have a non-empty transaction, we must not block on + * inodegc, which allocates its own transactions. */ - xfs_inodegc_flush(mp); + if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) + xfs_inodegc_push(mp); + else + xfs_inodegc_flush(mp); return xchk_iscan_iget_retry(iscan, true); } @@ -457,7 +516,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[idx] == NULL); - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); if (error) break; diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 71f657552dfa..f9f47fa01a9e 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -59,6 +59,9 @@ struct xchk_iscan { /* Set if the scan has been aborted due to some event in the fs. */ #define XCHK_ISCAN_OPSTATE_ABORTED (1) +/* Use trylock to acquire the AGI */ +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) + static inline bool xchk_iscan_aborted(const struct xchk_iscan *iscan) { @@ -71,8 +74,21 @@ xchk_iscan_abort(struct xchk_iscan *iscan) set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); } +static inline bool +xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +static inline void +xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_finish_early(struct xchk_iscan *iscan); void xchk_iscan_teardown(struct xchk_iscan *iscan); int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c new file mode 100644 index 000000000000..256ff7700c94 --- /dev/null +++ b/fs/xfs/scrub/listxattr.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_trans.h" +#include "scrub/scrub.h" +#include "scrub/bitmap.h" +#include "scrub/dab_bitmap.h" +#include "scrub/listxattr.h" + +/* Call a function for every entry in a shortform xattr structure. 
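/*
 * Editor's usage sketch for the xattr walker added below.  The callback
 * name is hypothetical, but its signature follows the xchk_xattr_fn
 * typedef from listxattr.h; a nonzero return value stops the walk.
 */
STATIC int
example_count_attr(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	unsigned int		attr_flags,
	const unsigned char	*name,
	unsigned int		namelen,
	const void		*value,
	unsigned int		valuelen,
	void			*priv)
{
	unsigned int		*nr = priv;

	(*nr)++;	/* tally every attr the walker shows us */
	return 0;
}

/* usage: unsigned int nr = 0; error = xchk_xattr_walk(sc, ip, example_count_attr, NULL, &nr); */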
*/ +STATIC int +xchk_xattr_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data; + struct xfs_attr_sf_entry *sfe; + unsigned int i; + int error; + + sfe = xfs_attr_sf_firstentry(hdr); + for (i = 0; i < hdr->count; i++) { + error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen, + &sfe->nameval[sfe->namelen], sfe->valuelen, + priv); + if (error) + return error; + + sfe = xfs_attr_sf_nextentry(sfe); + } + + return 0; +} + +/* Call a function for every entry in this xattr leaf block. */ +STATIC int +xchk_xattr_walk_leaf_entries( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + struct xfs_buf *bp, + void *priv) +{ + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; + unsigned int i; + int error; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + for (i = 0; i < ichdr.count; entry++, i++) { + void *value; + unsigned char *name; + unsigned int namelen, valuelen; + + if (entry->flags & XFS_ATTR_LOCAL) { + struct xfs_attr_leaf_name_local *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; + valuelen = be16_to_cpu(name_loc->valuelen); + } else { + struct xfs_attr_leaf_name_remote *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + value = NULL; + valuelen = be32_to_cpu(name_rmt->valuelen); + } + + error = attr_fn(sc, ip, entry->flags, name, namelen, value, + valuelen, priv); + if (error) + return error; + + } + + return 0; +} + +/* + * Call a function for every entry in a leaf-format xattr structure. Avoid + * memory allocations for the loop detector since there's only one block. + */ +STATIC int +xchk_xattr_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_buf *leaf_bp; + int error; + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp); + if (error) + return error; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv); + xfs_trans_brelse(sc->tp, leaf_bp); + return error; +} + +/* Find the leftmost leaf in the xattr dabtree. */ +STATIC int +xchk_xattr_find_leftmost_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + struct xdab_bitmap *seen_dablks, + struct xfs_buf **leaf_bpp) +{ + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_buf *bp; + xfs_failaddr_t fa; + xfs_dablk_t blkno = 0; + unsigned int expected_level = 0; + int error; + + for (;;) { + xfs_extlen_t len = 1; + uint16_t magic; + + /* Make sure we haven't seen this new block already. 
*/ + if (xdab_bitmap_test(seen_dablks, blkno, &len)) + return -EFSCORRUPTED; + + error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + + error = -EFSCORRUPTED; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) + goto out_buf; + + fa = xfs_da3_node_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); + + if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_buf; + + /* Check the level from the root node. */ + if (blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_buf; + else + expected_level--; + + /* Remember that we've seen this node. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + /* Find the next level towards the leaves of the dabtree. */ + btree = nodehdr.btree; + blkno = be32_to_cpu(btree->before); + xfs_trans_brelse(tp, bp); + } + + error = -EFSCORRUPTED; + fa = xfs_attr3_leaf_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + if (expected_level != 0) + goto out_buf; + + /* Remember that we've seen this leaf. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + *leaf_bpp = bp; + return 0; + +out_buf: + xfs_trans_brelse(tp, bp); + return error; +} + +/* Call a function for every entry in a node-format xattr structure. */ +STATIC int +xchk_xattr_walk_node( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xdab_bitmap seen_dablks; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_buf *leaf_bp; + int error; + + xdab_bitmap_init(&seen_dablks); + + error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp); + if (error) + goto out_bitmap; + + for (;;) { + xfs_extlen_t len; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, + priv); + if (error) + goto out_leaf; + + /* Find the right sibling of this leaf block. */ + leaf = leaf_bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (leafhdr.forw == 0) + goto out_leaf; + + xfs_trans_brelse(sc->tp, leaf_bp); + + if (leaf_fn) { + error = leaf_fn(sc, priv); + if (error) + goto out_bitmap; + } + + /* Make sure we haven't seen this new leaf already. */ + len = 1; + if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) { + error = -EFSCORRUPTED; + goto out_bitmap; + } + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, + leafhdr.forw, &leaf_bp); + if (error) + goto out_bitmap; + + /* Remember that we've seen this new leaf. */ + error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1); + if (error) + goto out_leaf; + } + +out_leaf: + xfs_trans_brelse(sc->tp, leaf_bp); +out_bitmap: + xdab_bitmap_destroy(&seen_dablks); + return error; +} + +/* + * Call a function for every extended attribute in a file. + * + * Callers must hold the ILOCK. No validation or cursor restarts allowed. + * Returns -EFSCORRUPTED on any problem, including loops in the dabtree. 
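/*
 * Editor's sketch of the loop-detection pattern used by this walker: never
 * visit a dabtree block twice, treating a revisit as corruption.  The flat
 * array below stands in for the xdab_bitmap used by the real code.
 */
static bool example_seen[1024];		/* one flag per dabtree block */

static int
example_visit_once(unsigned int blkno)
{
	if (blkno >= 1024 || example_seen[blkno])
		return -EFSCORRUPTED;	/* out of range or a cycle */
	example_seen[blkno] = true;	/* remember this block */
	return 0;
}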
+ */ +int +xchk_xattr_walk( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + if (!xfs_inode_hasattr(ip)) + return 0; + + if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_xattr_walk_sf(sc, ip, attr_fn, priv); + + /* attr functions require that the attr fork is loaded */ + error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + if (xfs_attr_is_leaf(ip)) + return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv); + + return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv); +} diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h new file mode 100644 index 000000000000..703cfb7b14cf --- /dev/null +++ b/fs/xfs/scrub/listxattr.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_LISTXATTR_H__ +#define __XFS_SCRUB_LISTXATTR_H__ + +typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int attr_flags, const unsigned char *name, + unsigned int namelen, const void *value, unsigned int valuelen, + void *priv); + +typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv); + +int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip, + xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv); + +#endif /* __XFS_SCRUB_LISTXATTR_H__ */ diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c new file mode 100644 index 000000000000..e21c16fbd15d --- /dev/null +++ b/fs/xfs/scrub/metapath.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_attr.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" +#include "scrub/repair.h" + +/* + * Metadata Directory Tree Paths + * ============================= + * + * A filesystem with metadir enabled expects to find metadata structures + * attached to files that are accessible by walking a path down the metadata + * directory tree. Given the metadir path and the incore inode storing the + * metadata, this scrubber ensures that the ondisk metadir path points to the + * ondisk inode represented by the incore inode. + */ + +struct xchk_metapath { + struct xfs_scrub *sc; + + /* Name for lookup */ + struct xfs_name xname; + + /* Directory update for repairs */ + struct xfs_dir_update du; + + /* Path down to this metadata file from the parent directory */ + const char *path; + + /* Directory parent of the metadata file. 
*/ + struct xfs_inode *dp; + + /* Locks held on dp */ + unsigned int dp_ilock_flags; + + /* Transaction block reservations */ + unsigned int link_resblks; + unsigned int unlink_resblks; + + /* Parent pointer updates */ + struct xfs_parent_args link_ppargs; + struct xfs_parent_args unlink_ppargs; + + /* Scratchpads for removing links */ + struct xfs_da_args pptr_args; +}; + +/* Release resources tracked in the buffer. */ +static inline void +xchk_metapath_cleanup( + void *buf) +{ + struct xchk_metapath *mpath = buf; + + if (mpath->dp_ilock_flags) + xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); + kfree(mpath->path); +} + +/* Set up a metadir path scan. @path must be dynamically allocated. */ +static inline int +xchk_setup_metapath_scan( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const char *path, + struct xfs_inode *ip) +{ + struct xchk_metapath *mpath; + int error; + + if (!path) + return -ENOMEM; + + error = xchk_install_live_inode(sc, ip); + if (error) { + kfree(path); + return error; + } + + mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); + if (!mpath) { + kfree(path); + return -ENOMEM; + } + + mpath->sc = sc; + sc->buf = mpath; + sc->buf_cleanup = xchk_metapath_cleanup; + + mpath->dp = dp; + mpath->path = path; /* path is now owned by mpath */ + + mpath->xname.name = mpath->path; + mpath->xname.len = strlen(mpath->path); + mpath->xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + + return 0; +} + +#ifdef CONFIG_XFS_RT +/* Scan the /rtgroups directory itself. */ +static int +xchk_setup_metapath_rtdir( + struct xfs_scrub *sc) +{ + if (!sc->mp->m_rtdirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); +} + +/* Scan a rtgroup inode under the /rtgroups directory. */ +static int +xchk_setup_metapath_rtginode( + struct xfs_scrub *sc, + enum xfs_rtg_inodes type) +{ + struct xfs_rtgroup *rtg; + struct xfs_inode *ip; + int error; + + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); + if (!rtg) + return -ENOENT; + + ip = rtg->rtg_inodes[type]; + if (!ip) { + error = -ENOENT; + goto out_put_rtg; + } + + error = xchk_setup_metapath_scan(sc, sc->mp->m_rtdirip, + xfs_rtginode_path(rtg_rgno(rtg), type), ip); + +out_put_rtg: + xfs_rtgroup_put(rtg); + return error; +} +#else +# define xchk_setup_metapath_rtdir(...) (-ENOENT) +# define xchk_setup_metapath_rtginode(...) (-ENOENT) +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +/* Scan the /quota directory itself. */ +static int +xchk_setup_metapath_quotadir( + struct xfs_scrub *sc) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + + if (!qi || !qi->qi_dirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); +} + +/* Scan a quota inode under the /quota directory. */ +static int +xchk_setup_metapath_dqinode( + struct xfs_scrub *sc, + xfs_dqtype_t type) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_inode *ip = NULL; + + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, qi->qi_dirip, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); +} +#else +# define xchk_setup_metapath_quotadir(...) (-ENOENT) +# define xchk_setup_metapath_dqinode(...) 
(-ENOENT) +#endif /* CONFIG_XFS_QUOTA */ + +int +xchk_setup_metapath( + struct xfs_scrub *sc) +{ + if (!xfs_has_metadir(sc->mp)) + return -ENOENT; + if (sc->sm->sm_gen) + return -EINVAL; + + switch (sc->sm->sm_ino) { + case XFS_SCRUB_METAPATH_PROBE: + /* Just probing, nothing else to do. */ + if (sc->sm->sm_agno) + return -EINVAL; + return 0; + case XFS_SCRUB_METAPATH_RTDIR: + return xchk_setup_metapath_rtdir(sc); + case XFS_SCRUB_METAPATH_RTBITMAP: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_BITMAP); + case XFS_SCRUB_METAPATH_RTSUMMARY: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_SUMMARY); + case XFS_SCRUB_METAPATH_QUOTADIR: + return xchk_setup_metapath_quotadir(sc); + case XFS_SCRUB_METAPATH_USRQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_USER); + case XFS_SCRUB_METAPATH_GRPQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP); + case XFS_SCRUB_METAPATH_PRJQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); + case XFS_SCRUB_METAPATH_RTRMAPBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); + case XFS_SCRUB_METAPATH_RTREFCOUNTBT: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT); + default: + return -ENOENT; + } +} + +/* + * Take the ILOCK on the metadata directory parent and child. We do not know + * that the metadata directory is not corrupt, so we lock the parent and try + * to lock the child. Returns 0 if successful, or -EINTR to abort the scrub. + */ +STATIC int +xchk_metapath_ilock_both( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) { + mpath->dp_ilock_flags |= XFS_ILOCK_EXCL; + return 0; + } + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* Unlock parent and child inodes. */ +static inline void +xchk_metapath_iunlock( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); +} + +int +xchk_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + xfs_ino_t ino = NULLFSINO; + int error; + + /* Just probing, nothing else to do. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) + goto out_cancel; + + /* Make sure the parent dir has a dirent pointing to this file. */ + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* No directory entry at all */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_ilock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_ilock; + if (ino != sc->ip->i_ino) { + /* Pointing to wrong inode */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + +out_ilock: + xchk_metapath_iunlock(mpath); +out_cancel: + xchk_trans_cancel(sc); + return error; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Create the dirent represented by the final component of the path. 
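/*
 * Editor's illustration (path spellings inferred from the setup functions
 * above and are assumptions, not text from the patch): the checker
 * compares dirents such as
 *
 *	metadir:/rtgroups		-> mp->m_rtdirip
 *	metadir:/rtgroups/0.bitmap	-> rtg->rtg_inodes[XFS_RTGI_BITMAP]
 *	metadir:/quota/user		-> qi->qi_uquotaip
 *
 * against the inode numbers of the corresponding incore metadata inodes.
 */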
*/ +STATIC int +xrep_metapath_link( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = sc->ip; + + if (xfs_has_parent(sc->mp)) + mpath->du.ppargs = &mpath->link_ppargs; + else + mpath->du.ppargs = NULL; + + trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino); + + return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du); +} + +/* Remove the dirent at the final component of the path. */ +STATIC int +xrep_metapath_unlink( + struct xchk_metapath *mpath, + xfs_ino_t ino, + struct xfs_inode *ip) +{ + struct xfs_parent_rec rec; + struct xfs_scrub *sc = mpath->sc; + struct xfs_mount *mp = sc->mp; + int error; + + trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino); + + if (!ip) { + /* The child inode isn't allocated. Junk the dirent. */ + xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE); + return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname, + ino, mpath->unlink_resblks); + } + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = ip; + mpath->du.ppargs = NULL; + + /* Figure out if we're removing a parent pointer too. */ + if (xfs_has_parent(mp)) { + xfs_inode_to_parent_rec(&rec, ip); + error = xfs_parent_lookup(sc->tp, ip, &mpath->xname, &rec, + &mpath->pptr_args); + switch (error) { + case -ENOATTR: + break; + case 0: + mpath->du.ppargs = &mpath->unlink_ppargs; + break; + default: + return error; + } + } + + return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du); +} + +/* + * Try to create a dirent in @mpath->dp with the name @mpath->xname that points + * to @sc->ip. Returns: + * + * -EEXIST and an @alleged_child if the dirent points to the wrong inode; + * 0 if there is now a dirent pointing to @sc->ip; or + * A negative errno on error. + */ +STATIC int +xrep_metapath_try_link( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + xfs_ino_t ino; + int error; + + /* Allocate transaction, lock inodes, join to transaction. */ + error = xchk_trans_alloc(sc, mpath->link_resblks); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory. Create an entry + * pointing to @sc->ip. + */ + error = xrep_metapath_link(mpath); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + xchk_metapath_iunlock(mpath); + return error; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = 0; + goto out_cancel; + } + + /* + * The dirent points elsewhere; pass that back so that the caller + * can try to remove the dirent. + */ + *alleged_child = ino; + error = -EEXIST; + +out_cancel: + xchk_trans_cancel(sc); + xchk_metapath_iunlock(mpath); + return error; +} + +/* + * Take the ILOCK on the metadata directory parent and a bad child, if one is + * supplied. We do not know that the metadata directory is not corrupt, so we + * lock the parent and try to lock the child. Returns 0 if successful, or + * -EINTR to abort the repair. The lock state of @dp is not recorded in @mpath.
+ */ +STATIC int +xchk_metapath_ilock_parent_and_child( + struct xchk_metapath *mpath, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return 0; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* + * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points + * to @alleged_child. Returns: + * + * 0 if there is no longer a dirent; + * -EEXIST if the dirent points to @sc->ip; + * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or + * A negative errno for any other error. + */ +STATIC int +xrep_metapath_try_unlink( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + struct xfs_inode *ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(*alleged_child != sc->ip->i_ino); + + trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp, + *alleged_child); + + /* + * Allocate transaction, grab the alleged child inode, lock inodes, + * join to transaction. + */ + error = xchk_trans_alloc(sc, mpath->unlink_resblks); + if (error) + return error; + + error = xchk_iget(sc, *alleged_child, &ip); + if (error == -EINVAL || error == -ENOENT) { + /* inode number is bogus, junk the dirent */ + error = 0; + } + if (error) { + xchk_trans_cancel(sc); + return error; + } + + error = xchk_metapath_ilock_parent_and_child(mpath, ip); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + if (ip) + xfs_trans_ijoin(sc->tp, ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory anymore. We're ready to + * try the link operation again. + */ + error = 0; + goto out_cancel; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = -EEXIST; + goto out_cancel; + } + + /* + * The dirent does not point to the alleged child. Update the caller + * and signal that we want to be called again. + */ + if (ino != *alleged_child) { + *alleged_child = ino; + error = -EAGAIN; + goto out_cancel; + } + + /* Remove the link to the child. */ + error = xrep_metapath_unlink(mpath, ino, ip); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + goto out_unlock; + +out_cancel: + xchk_trans_cancel(sc); +out_unlock: + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xchk_irele(sc, ip); + } + return error; +} + +/* + * Make sure the metadata directory path points to the child being examined. + * + * Repair needs to be able to create a directory structure, create its own + * transactions, and take ILOCKs. This function /must/ be called after all + * other repairs have completed. + */ +int +xrep_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Just probing, nothing to repair. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) + return -EFSCORRUPTED; + + /* + * Make sure the child file actually has an attr fork to receive a new + * parent pointer if the fs has parent pointers. 
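+	 * An empty shortform attr header is enough here; creating the
+	 * parent pointer later will expand the fork as needed.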
+ */ + if (xfs_has_parent(mp)) { + error = xfs_attr_add_fork(sc->ip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + } + + /* Compute block reservation required to unlink and link a file. */ + mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN); + mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN); + + do { + xfs_ino_t alleged_child; + + /* Re-establish the link, or tell us which inode to remove. */ + error = xrep_metapath_try_link(mpath, &alleged_child); + if (!error) + return 0; + if (error != -EEXIST) + return error; + + /* + * Remove an incorrect link to an alleged child, or tell us + * which inode to remove. + */ + do { + error = xrep_metapath_try_unlink(mpath, &alleged_child); + } while (error == -EAGAIN); + if (error == -EEXIST) { + /* Link established; we're done. */ + error = 0; + break; + } + } while (!error); + + return error; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 4a0271123d94..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -19,6 +19,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -58,9 +60,9 @@ xrep_newbt_estimate_slack( if (sc->ops->type == ST_PERAG) { free = sc->sa.pag->pagf_freeblks; - sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } @@ -121,6 +123,43 @@ xrep_newbt_init_inode( } /* + * Initialize accounting resources for staging a new metadata inode btree. + * If the metadata file has a space reservation, the caller must adjust that + * reservation when committing the new ondisk btree. + */ +int +xrep_newbt_init_metadir_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + struct xfs_owner_info oinfo; + struct xfs_ifork *ifp; + + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + /* + * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the + * inode metadata space reservations can only account allocated space + * to the i_nblocks. We do not want to change the inode core fields + * until we're ready to commit the new tree, so we allocate the blocks + * as if they were regular file blocks. This exposes us to a higher + * risk of the repair being cancelled due to ENOSPC. + */ + xrep_newbt_init_ag(xnr, sc, &oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK); + return 0; +} + +/* * Initialize accounting resources for staging a new btree. Callers are * expected to add their own reservations (and clean them up) manually. 
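 * * A rough sketch of how a repair function might drive this API (xnr and * nr_blocks are illustrative variables; error handling and the btree * staging cursor are elided): * * xrep_newbt_init_metadir_inode(&xnr, sc); * xrep_newbt_alloc_blocks(&xnr, nr_blocks); * ...bulk-load the records, drawing each new btree block from * xrep_newbt_claim_block()... * ...commit the new tree, then free the unused reservations...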
*/ @@ -160,7 +199,8 @@ xrep_newbt_add_blocks( if (args->tp) { ASSERT(xnr->oinfo.oi_offset == 0); - error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + error = xfs_alloc_schedule_autoreap(args, + XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap); if (error) goto out_pag; } @@ -185,11 +225,10 @@ xrep_newbt_add_extent( xfs_agblock_t agbno, xfs_extlen_t len) { - struct xfs_mount *mp = xnr->sc->mp; struct xfs_alloc_arg args = { .tp = NULL, /* no autoreap */ .oinfo = xnr->oinfo, - .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .fsbno = xfs_agbno_to_fsb(pag, agbno), .len = len, .resv = xnr->resv, }; @@ -205,12 +244,12 @@ xrep_newbt_validate_ag_alloc_hint( struct xfs_scrub *sc = xnr->sc; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); - if (agno == sc->sa.pag->pag_agno && + if (agno == pag_agno(sc->sa.pag) && xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) return; - xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_AGFL_BLOCK(sc->mp) + 1); + xnr->alloc_hint = + xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1); } /* Allocate disk space for a new per-AG btree. */ @@ -224,6 +263,7 @@ xrep_newbt_alloc_ag_blocks( int error = 0; ASSERT(sc->sa.pag != NULL); + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); while (nr_blocks > 0) { struct xfs_alloc_arg args = { @@ -250,16 +290,15 @@ xrep_newbt_alloc_ag_blocks( return -ENOSPC; agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + if (agno != pag_agno(sc->sa.pag)) { + ASSERT(agno == pag_agno(sc->sa.pag)); + return -EFSCORRUPTED; + } - trace_xrep_newbt_alloc_ag_blocks(mp, agno, + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, xnr->oinfo.oi_owner); - if (agno != sc->sa.pag->pag_agno) { - ASSERT(agno == sc->sa.pag->pag_agno); - return -EFSCORRUPTED; - } - error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); if (error) return error; @@ -298,6 +337,8 @@ xrep_newbt_alloc_file_blocks( struct xfs_mount *mp = sc->mp; int error = 0; + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); + while (nr_blocks > 0) { struct xfs_alloc_arg args = { .tp = sc->tp, @@ -325,16 +366,16 @@ xrep_newbt_alloc_file_blocks( agno = XFS_FSB_TO_AGNO(mp, args.fsbno); - trace_xrep_newbt_alloc_file_blocks(mp, agno, - XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, - xnr->oinfo.oi_owner); - pag = xfs_perag_get(mp, agno); if (!pag) { ASSERT(0); return -EFSCORRUPTED; } + trace_xrep_newbt_alloc_file_blocks(pag, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + error = xrep_newbt_add_blocks(xnr, pag, &args); xfs_perag_put(pag); if (error) @@ -375,7 +416,6 @@ xrep_newbt_free_extent( struct xfs_scrub *sc = xnr->sc; xfs_agblock_t free_agbno = resv->agbno; xfs_extlen_t free_aglen = resv->len; - xfs_fsblock_t fsbno; int error; if (!btree_committed || resv->used == 0) { @@ -384,8 +424,8 @@ xrep_newbt_free_extent( * space reservation, let the existing EFI free the entire * space extent. 
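 * The EFI in question is the autoreap intent that * xrep_newbt_add_blocks() scheduled when this space was allocated; * committing it just below lets that intent free the whole extent.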
*/ - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, - free_agbno, free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); return 1; } @@ -402,8 +442,8 @@ xrep_newbt_free_extent( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); ASSERT(xnr->resv != XFS_AG_RESV_AGFL); ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); @@ -412,9 +452,9 @@ xrep_newbt_free_extent( * Use EFIs to free the reservations. This reduces the chance * that we leak blocks if the system goes down. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); - error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, - xnr->resv, true); + error = xfs_free_extent_later(sc->tp, + xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen, + &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -515,7 +555,6 @@ xrep_newbt_claim_block( union xfs_btree_ptr *ptr) { struct xrep_newbt_resv *resv; - struct xfs_mount *mp = cur->bc_mp; xfs_agblock_t agbno; /* @@ -540,12 +579,10 @@ xrep_newbt_claim_block( if (resv->used == resv->len) list_move_tail(&resv->list, &xnr->resv_list); - trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, - xnr->oinfo.oi_owner); + trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, - agbno)); + ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno)); else ptr->s = cpu_to_be32(agbno); diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index 3d804d31af24..5ce785599287 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -63,6 +63,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, enum xfs_ag_resv_type resv); int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc); int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 8a7d9557897c..4a47d0aabf73 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -18,15 +18,19 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ag.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" /* * Live Inode Link Count Checking @@ -43,11 +47,23 @@ int xchk_setup_nlinks( struct xfs_scrub *sc) { + struct xchk_nlink_ctrs *xnc; + int error; + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); - sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); - if (!sc->buf) + if (xchk_could_repair(sc)) { + error = xrep_setup_nlinks(sc); + if (error) + return error; + } + + xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!xnc) return -ENOMEM; + xnc->xname.name = xnc->namebuf; + xnc->sc = sc; + 
sc->buf = xnc; return xchk_setup_fs(sc); } @@ -152,6 +168,13 @@ xchk_nlinks_live_update( xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(p->dp)) + return NOTIFY_DONE; + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, p->delta, p->name->name, p->name->len); @@ -251,12 +274,17 @@ xchk_nlinks_collect_dirent( * number of parents of the root directory. * * Otherwise, increment the number of backrefs pointing back to ino. + * + * If the filesystem has parent pointers, we walk the pptrs to + * determine the backref count. */ if (dotdot) { - if (dp == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(dp)) error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); - else + else if (!xfs_has_parent(sc->mp)) error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + else + error = 0; if (error) goto out_unlock; } @@ -293,6 +321,61 @@ out_incomplete: return error; } +/* Bump the backref count for the inode referenced by this parent pointer. */ +STATIC int +xchk_nlinks_collect_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_nlink_ctrs *xnc = priv; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + int error; + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec); + + mutex_lock(&xnc->lock); + + error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0); + if (error) + goto out_unlock; + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + /* Walk a directory to bump the observed link counts of the children. */ STATIC int xchk_nlinks_collect_dir( @@ -303,6 +386,13 @@ xchk_nlinks_collect_dir( unsigned int lock_mode; int error = 0; + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(dp)) + return 0; + /* Prevent anyone from changing this directory while we walk it. */ xfs_ilock(dp, XFS_IOLOCK_SHARED); lock_mode = xfs_ilock_data_map_shared(dp); @@ -332,6 +422,28 @@ xchk_nlinks_collect_dir( if (error) goto out_abort; + /* Walk the parent pointers to get real backref counts. */ + if (xfs_has_parent(sc->mp)) { + /* + * If the extended attributes look as though they have been + * zapped by the inode record repair code, we cannot scan for + * parent pointers.
+ */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL, + xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + } + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); goto out_unlock; @@ -537,6 +649,14 @@ xchk_nlinks_compare_inode( unsigned int actual_nlink; int error; + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_SHARED); mutex_lock(&xnc->lock); @@ -571,9 +691,11 @@ xchk_nlinks_compare_inode( * this as a corruption. The VFS won't let users increase the link * count, but it will let them decrease it. */ - if (total_links > XFS_MAXLINK) { + if (total_links > XFS_NLINK_PINNED) { xchk_ino_set_corrupt(sc, ip->i_ino); goto out_corrupt; + } else if (total_links > XFS_MAXLINK) { + xchk_ino_set_warning(sc, ip->i_ino); } /* Link counts should match. */ @@ -613,7 +735,7 @@ xchk_nlinks_compare_inode( } } - if (ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(ip)) { /* * For the root of a directory tree, both the '.' and '..' * entries should point to the root directory. The dotdot @@ -850,9 +972,6 @@ xchk_nlinks_setup_scan( xfs_agino_t first_agino, last_agino; int error; - ASSERT(xnc->sc == NULL); - xnc->sc = sc; - mutex_init(&xnc->lock); /* Retry iget every tenth of a second for up to 30 seconds. */ diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h index a950f3daf204..b820712bfd87 100644 --- a/fs/xfs/scrub/nlinks.h +++ b/fs/xfs/scrub/nlinks.h @@ -28,6 +28,13 @@ struct xchk_nlink_ctrs { * from other writer threads. */ struct xfs_dir_hook dhook; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; }; /* diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index b87618322f55..4ebdee095428 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -17,14 +17,19 @@ #include "xfs_iwalk.h" #include "xfs_ialloc.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" +#include "scrub/tempfile.h" /* * Live Inode Link Count Repair @@ -36,6 +41,46 @@ * inode is locked. */ +/* Set up to repair inode link counts. */ +int +xrep_setup_nlinks( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* + * Inodes that aren't the root directory or the orphanage, have a nonzero link + * count, and no observed parents should be moved to the orphanage. + */ +static inline bool +xrep_nlinks_is_orphaned( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int actual_nlink, + const struct xchk_nlink *obs) +{ + if (obs->parents != 0) + return false; + if (xchk_inode_is_dirtree_root(ip) || ip == sc->orphanage) + return false; + return actual_nlink != 0; +} + +/* Remove an inode from the unlinked list. 
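+ * + * Files with zero link count are kept on one of the AGI unlinked lists + * until they can be reclaimed, so a repair that reconnects a file must + * also take it off that list; xrep_nlinks_repair_inode() below handles + * both directions.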
*/ +STATIC int +xrep_nlinks_iunlink_remove( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + return error; +} + /* * Correct the link count of the given inode. Because we have to grab locks * and resources in a certain order, it's possible that this will be a no-op. @@ -50,17 +95,55 @@ xrep_nlinks_repair_inode( struct xfs_inode *ip = sc->ip; uint64_t total_links; uint64_t actual_nlink; + bool orphanage_available = false; bool dirty = false; int error; - xchk_ilock(sc, XFS_IOLOCK_EXCL); + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); - if (error) - return error; + /* + * If the filesystem has an orphanage attached to the scrub context, + * prepare for a link count repair that could involve @ip being adopted + * by the lost+found. + */ + if (xrep_orphanage_can_adopt(sc)) { + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; - xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_trans_ijoin(sc->tp, ip, 0); + error = xrep_adoption_trans_alloc(sc, &xnc->adoption); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } else { + orphanage_available = true; + } + } + + /* + * Either there is no orphanage or we couldn't allocate resources for + * that kind of update. Let's try again with only the resources we + * need for a simple link count update, since that's much more common. + */ + if (!orphanage_available) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, + &sc->tp); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return error; + } + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + } mutex_lock(&xnc->lock); @@ -99,28 +182,68 @@ xrep_nlinks_repair_inode( } /* - * We did not find any links to this inode. If the inode agrees, we - * have nothing further to do. If not, the inode has a nonzero link - * count and we don't have anywhere to graft the child onto. Dropping - * a live inode's link count to zero can cause unexpected shutdowns in - * inactivation, so leave it alone. + * Decide if we're going to move this file to the orphanage, and fix + * up the incore link counts if we are. */ - if (total_links == 0) { - if (actual_nlink != 0) - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); - goto out_trans; + if (orphanage_available && + xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) { + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname); + if (error) + goto out_trans; + + /* + * Reattach this file to the directory tree by moving it to + * the orphanage per the adoption parameters that we already + * computed. + */ + error = xrep_adoption_move(&xnc->adoption); + if (error) + goto out_trans; + + /* + * Re-read the link counts since the reparenting will have + * updated our scan info. + */ + mutex_lock(&xnc->lock); + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + mutex_unlock(&xnc->lock); + if (error) + goto out_trans; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + dirty = true; } - /* Commit the new link count if it changed. 
*/ - if (total_links != actual_nlink) { - if (total_links > XFS_MAXLINK) { - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) { + error = xrep_nlinks_iunlink_remove(sc); + if (error) goto out_trans; - } + dirty = true; + } + + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) { + error = xfs_iunlink(sc->tp, ip); + if (error) + goto out_trans; + dirty = true; + } + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { trace_xrep_nlinks_update_inode(mp, ip, &obs); - set_nlink(VFS_I(ip), total_links); + set_nlink(VFS_I(ip), min_t(unsigned long long, total_links, + XFS_NLINK_PINNED)); dirty = true; } @@ -132,14 +255,19 @@ xrep_nlinks_repair_inode( xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); error = xrep_trans_commit(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - return error; + goto out_unlock; out_scanlock: mutex_unlock(&xnc->lock); out_trans: xchk_trans_cancel(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_unlock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (orphanage_available) { + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + xchk_iunlock(sc, XFS_IOLOCK_EXCL); return error; } @@ -172,10 +300,10 @@ xrep_nlinks( /* * We need ftype for an accurate count of the number of child * subdirectory links. Child subdirectories with a back link (dotdot - * entry) but no forward link are unfixable, so we cannot repair the - * link count of the parent directory based on the back link count - * alone. Filesystems without ftype support are rare (old V4) so we - * just skip out here. + * entry) but no forward link are moved to the orphanage, so we cannot + * repair the link count of the parent directory based on the back link + * count alone. Filesystems without ftype support are rare (old V4) so + * we just skip out here. */ if (!xfs_has_ftype(sc->mp)) return -EOPNOTSUPP; diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c new file mode 100644 index 000000000000..9c12cb844231 --- /dev/null +++ b/fs/xfs/scrub/orphanage.c @@ -0,0 +1,629 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_parent.h" +#include "xfs_attr_sf.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/orphanage.h" +#include "scrub/readdir.h" + +#include <linux/namei.h> + +/* + * The Orphanage + * ============= + * + * If the directory tree is damaged, children of that directory become + * inaccessible via that file path. If a child has no other parents, the file + * is said to be orphaned. 
xfs_repair fixes this situation by creating an + * orphanage directory (specifically, /lost+found) and creating a directory + * entry pointing to the orphaned file. + * + * Online repair follows this tactic by creating a root-owned /lost+found + * directory if one does not exist. If an orphan is found, it will move that + * file into the orphanage. + */ + +/* Make the orphanage owned by root. */ +STATIC int +xrep_chown_orphanage( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = sc->mp; + struct xfs_dquot *udqp = NULL, *gdqp = NULL, *pdqp = NULL; + struct xfs_dquot *oldu = NULL, *oldg = NULL, *oldp = NULL; + struct inode *inode = VFS_I(dp); + int error; + + error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + if (error) + return error; + + error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp); + if (error) + goto out_dqrele; + + /* + * Always clear setuid/setgid/sticky on the orphanage since we don't + * normally want that functionality on this directory and xfs_repair + * doesn't create it this way either. Leave the other access bits + * unchanged. + */ + inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) { + if (XFS_IS_UQUOTA_ON(mp)) + oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp); + inode->i_uid = GLOBAL_ROOT_UID; + } + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) { + if (XFS_IS_GQUOTA_ON(mp)) + oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp); + inode->i_gid = GLOBAL_ROOT_GID; + } + if (dp->i_projid != 0) { + if (XFS_IS_PQUOTA_ON(mp)) + oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp); + dp->i_projid = 0; + } + + dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); + + if (xfs_has_wsync(mp)) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + + xfs_qm_dqrele(oldu); + xfs_qm_dqrele(oldg); + xfs_qm_dqrele(oldp); + +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + return error; +} + +#define ORPHANAGE "lost+found" + +/* Create the orphanage directory, and set sc->orphanage to it. */ +int +xrep_orphanage_create( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct dentry *root_dentry, *orphanage_dentry; + struct inode *root_inode = VFS_I(sc->mp->m_rootip); + struct inode *orphanage_inode; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) { + sc->orphanage = NULL; + return 0; + } + + ASSERT(sc->tp == NULL); + ASSERT(sc->orphanage == NULL); + + /* Find the dentry for the root directory... */ + root_dentry = d_find_alias(root_inode); + if (!root_dentry) { + error = -EFSCORRUPTED; + goto out; + } + + /* ...which is a directory, right? */ + if (!d_is_dir(root_dentry)) { + error = -EFSCORRUPTED; + goto out_dput_root; + } + + /* Try to find the orphanage directory. */ + inode_lock_nested(root_inode, I_MUTEX_PARENT); + orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); + if (IS_ERR(orphanage_dentry)) { + error = PTR_ERR(orphanage_dentry); + goto out_unlock_root; + } + + /* + * Nothing found? Call mkdir to create the orphanage. Create the + * directory without other-user access because we're live and someone + * could have been relying partly on minimal access to a parent + * directory to control access to a file we put in here.
+ */ + if (d_really_is_negative(orphanage_dentry)) { + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, + orphanage_dentry, 0750); + error = PTR_ERR(orphanage_dentry); + if (IS_ERR(orphanage_dentry)) + goto out_unlock_root; + } + + /* Not a directory? Bail out. */ + if (!d_is_dir(orphanage_dentry)) { + error = -ENOTDIR; + goto out_dput_orphanage; + } + + /* + * Grab a reference to the orphanage. This /should/ succeed since + * we hold the root directory locked and therefore nobody can delete + * the orphanage. + */ + orphanage_inode = igrab(d_inode(orphanage_dentry)); + if (!orphanage_inode) { + error = -ENOENT; + goto out_dput_orphanage; + } + + /* Make sure the orphanage is owned by root. */ + error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode)); + if (error) + goto out_dput_orphanage; + + /* Stash the reference for later and bail out. */ + sc->orphanage = XFS_I(orphanage_inode); + sc->orphanage_ilock_flags = 0; + +out_dput_orphanage: + dput(orphanage_dentry); +out_unlock_root: + inode_unlock(VFS_I(sc->mp->m_rootip)); +out_dput_root: + dput(root_dentry); +out: + return error; +} + +void +xrep_orphanage_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->orphanage_ilock_flags |= ilock_flags; + xfs_ilock(sc->orphanage, ilock_flags); +} + +bool +xrep_orphanage_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) { + sc->orphanage_ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xrep_orphanage_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_iunlock(sc->orphanage, ilock_flags); + sc->orphanage_ilock_flags &= ~ilock_flags; +} + +/* Grab the IOLOCK of the orphanage and sc->ip. */ +int +xrep_orphanage_iolock_two( + struct xfs_scrub *sc) +{ + int error = 0; + + while (true) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Normal XFS takes the IOLOCK before grabbing a transaction. + * Scrub holds a transaction, which means that we can't block + * on either IOLOCK. + */ + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + delay(1); + } + + return 0; +} + +/* Release the orphanage. */ +void +xrep_orphanage_rele( + struct xfs_scrub *sc) +{ + if (!sc->orphanage) + return; + + if (sc->orphanage_ilock_flags) + xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags); + + xchk_irele(sc, sc->orphanage); + sc->orphanage = NULL; +} + +/* Adoption moves a file into /lost+found */ + +/* Can the orphanage adopt @sc->ip? */ +bool +xrep_orphanage_can_adopt( + struct xfs_scrub *sc) +{ + ASSERT(sc->ip != NULL); + + if (!sc->orphanage) + return false; + if (sc->ip == sc->orphanage) + return false; + if (xchk_inode_is_sb_rooted(sc->ip)) + return false; + if (xfs_is_internal_inode(sc->ip)) + return false; + return true; +} + +/* + * Create a new transaction to send a child to the orphanage. + * + * Allocate a new transaction with sufficient disk space to handle the + * adoption, take ILOCK_EXCL of the orphanage and sc->ip, join them to the + * transaction, and reserve quota to reparent the latter. Caller must hold the + * IOLOCK of the orphanage and sc->ip.
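+ * + * One possible calling sequence, modelled on the nlinks repair code (a + * sketch only; error handling is elided, adopt and xname are illustrative + * variables, and @xname must wrap a caller-owned MAXNAMELEN buffer): + * + * error = xrep_orphanage_iolock_two(sc); + * error = xrep_adoption_trans_alloc(sc, &adopt); + * error = xrep_adoption_compute_name(&adopt, &xname); + * error = xrep_adoption_move(&adopt); + * error = xrep_adoption_trans_roll(&adopt);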
+ */ +int +xrep_adoption_trans_alloc( + struct xfs_scrub *sc, + struct xrep_adoption *adopt) +{ + struct xfs_mount *mp = sc->mp; + unsigned int child_blkres = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(sc->ip != NULL); + ASSERT(sc->orphanage != NULL); + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL); + ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + ASSERT(!(sc->orphanage_ilock_flags & + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))); + + /* Compute the worst case space reservation that we need. */ + adopt->sc = sc; + adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN); + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + child_blkres = xfs_rename_space_res(mp, 0, false, + xfs_name_dotdot.len, false); + if (xfs_has_parent(mp)) + child_blkres += XFS_ADDAFORK_SPACE_RES(mp); + adopt->child_blkres = child_blkres; + + /* + * Allocate a transaction to link the child into the parent, along with + * enough disk space to handle expansion of both the orphanage and the + * dotdot entry of a child directory. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, + adopt->orphanage_blkres + adopt->child_blkres, 0, 0, + &sc->tp); + if (error) + return error; + + xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL, + sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL; + + xfs_trans_ijoin(sc->tp, sc->orphanage, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* + * Reserve enough quota in the orphan directory to add the new name. + * Normally the orphanage should have user/group/project ids of zero + * and hence is not subject to quota enforcement, but we're allowed to + * exceed quota to reattach disconnected parts of the directory tree. + */ + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage, + adopt->orphanage_blkres, 0, true); + if (error) + goto out_cancel; + + /* + * Reserve enough quota in the child directory to change dotdot. + * Here we're also allowed to exceed file quota to repair inconsistent + * metadata. + */ + if (adopt->child_blkres) { + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip, + adopt->child_blkres, 0, true); + if (error) + goto out_cancel; + } + + return 0; +out_cancel: + xchk_trans_cancel(sc); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return error; +} + +/* + * Compute the xfs_name for the directory entry that we're adding to the + * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not + * reuse namebuf until the adoption completes or is dissolved. + */ +int +xrep_adoption_compute_name( + struct xrep_adoption *adopt, + struct xfs_name *xname) +{ + struct xfs_scrub *sc = adopt->sc; + char *namebuf = (void *)xname->name; + xfs_ino_t ino; + unsigned int incr = 0; + int error = 0; + + adopt->xname = xname; + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino); + xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode); + + /* Make sure the filename is unique in the lost+found. */ + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + while (error == 0 && incr < 10000) { + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u", + sc->ip->i_ino, ++incr); + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + } + if (error == 0) { + /* We already have 10,000 entries in the orphanage? */ + return -EFSCORRUPTED; + } + + if (error != -ENOENT) + return error; + return 0; +} + +/* + * Make sure the dcache does not have a positive dentry for the name we've + * chosen. 
The caller should have checked with the ondisk directory, so any + * discrepancy is a sign that something is seriously wrong. + */ +static int +xrep_adoption_check_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + int error = 0; + + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return 0; + + d_child = try_lookup_noperm(&qname, d_orphanage); + if (d_child) { + trace_xrep_adoption_check_child(sc->mp, d_child); + + if (d_is_positive(d_child)) { + ASSERT(d_is_negative(d_child)); + error = -EFSCORRUPTED; + } + + dput(d_child); + } + + dput(d_orphanage); + return error; +} + +/* + * Invalidate all dentries for the name that was added to the orphanage + * directory, and all dentries pointing to the child inode that was moved. + * + * There should not be any positive entries for the name, since we've + * maintained our lock on the orphanage directory. + */ +static void +xrep_adoption_zap_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + + /* Invalidate all dentries for the adoption name */ + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return; + + d_child = try_lookup_noperm(&qname, d_orphanage); + while (d_child != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + ASSERT(d_is_negative(d_child)); + d_invalidate(d_child); + dput(d_child); + d_child = d_lookup(d_orphanage, &qname); + } + + dput(d_orphanage); + + /* Invalidate all the dentries pointing down to this file. */ + while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + d_invalidate(d_child); + dput(d_child); + } +} + +/* + * If we have to add an attr fork ahead of a parent pointer update, how much + * space should we ask for? + */ +static inline int +xrep_adoption_attr_sizeof( + const struct xrep_adoption *adopt) +{ + return sizeof(struct xfs_attr_sf_hdr) + + xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec), + adopt->xname->len); +} + +/* + * Move the current file to the orphanage under the computed name. + * + * Returns with a dirty transaction so that the caller can handle any other + * work, such as fixing up unlinked lists or resetting link counts. + */ +int +xrep_adoption_move( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode); + int error; + + trace_xrep_adoption_reparent(sc->orphanage, adopt->xname, + sc->ip->i_ino); + + error = xrep_adoption_check_dcache(adopt); + if (error) + return error; + + /* + * If this filesystem has parent pointers, ensure that the file being + * moved to the orphanage has an attribute fork. This is required + * because the parent pointer code does not itself add attr forks. + */ + if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) { + int sf_size = xrep_adoption_attr_sizeof(adopt); + + error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true); + if (error) + return error; + } + + /* Create the new name in the orphanage. */ + error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname, + sc->ip->i_ino, adopt->orphanage_blkres); + if (error) + return error; + + /* + * Bump the link count of the orphanage if we just added a + * subdirectory, and update its timestamps. 
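+	 * The extra link is the new subdirectory's dotdot entry, which now
+	 * points back at the orphanage.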
+ */ + xfs_trans_ichgtime(sc->tp, sc->orphanage, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (isdir) + xfs_bumplink(sc->tp, sc->orphanage); + xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE); + + /* Bump the link count of the child. */ + if (adopt->bump_child_nlink) { + xfs_bumplink(sc->tp, sc->ip); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Replace the dotdot entry if the child is a subdirectory. */ + if (isdir) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + sc->orphanage->i_ino, adopt->child_blkres); + if (error) + return error; + } + + /* Add a parent pointer from the file back to the lost+found. */ + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_addname(sc->tp, &adopt->ppargs, + sc->orphanage, adopt->xname, sc->ip); + if (error) + return error; + } + + /* + * Notify dirent hooks that we moved the file to /lost+found, and + * finish all the deferred work so that we know the adoption is fully + * recorded in the log. + */ + xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname); + + /* Remove negative dentries from the lost+found's dcache */ + xrep_adoption_zap_dcache(adopt); + return 0; +} + +/* + * Roll to a clean scrub transaction so that we can release the orphanage, + * even if xrep_adoption_move was not called. + * + * Commits all the work and deferred ops attached to an adoption request and + * rolls to a clean scrub transaction. On success, returns 0 with the scrub + * context holding a clean transaction with no inodes joined. On failure, + * returns negative errno with no scrub transaction. All inode locks are + * still held after this function returns. + */ +int +xrep_adoption_trans_roll( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + int error; + + trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip, + !!(sc->tp->t_flags & XFS_TRANS_DIRTY)); + + /* Finish all the deferred ops to commit all repairs. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + /* Roll the transaction once more to detach the inodes. */ + return xfs_trans_roll(&sc->tp); +} diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h new file mode 100644 index 000000000000..7c7a2e7d81db --- /dev/null +++ b/fs/xfs/scrub/orphanage.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ORPHANAGE_H__ +#define __XFS_SCRUB_ORPHANAGE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_orphanage_create(struct xfs_scrub *sc); + +/* + * If we're doing a repair, ensure that the orphanage exists and attach it to + * the scrub context. + */ +static inline int +xrep_orphanage_try_create( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR); + + error = xrep_orphanage_create(sc); + switch (error) { + case 0: + case -ENOENT: + case -ENOTDIR: + case -ENOSPC: + /* + * If the orphanage can't be found or isn't a directory, we'll + * keep going, but we won't be able to attach the file to the + * orphanage if we can't find the parent. 
+ */ + return 0; + } + + return error; +} + +int xrep_orphanage_iolock_two(struct xfs_scrub *sc); + +void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc, + unsigned int ilock_flags); +void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + +void xrep_orphanage_rele(struct xfs_scrub *sc); + +/* Information about a request to add a file to the orphanage. */ +struct xrep_adoption { + struct xfs_scrub *sc; + + /* Name used for the adoption. */ + struct xfs_name *xname; + + /* Parent pointer context tracking */ + struct xfs_parent_args ppargs; + + /* Block reservations for orphanage and child (if directory). */ + unsigned int orphanage_blkres; + unsigned int child_blkres; + + /* + * Does the caller want us to bump the child link count? This is not + * needed when reattaching files that have become disconnected but have + * nlink > 1. It is necessary when changing the directory tree + * structure. + */ + bool bump_child_nlink:1; +}; + +bool xrep_orphanage_can_adopt(struct xfs_scrub *sc); + +int xrep_adoption_trans_alloc(struct xfs_scrub *sc, + struct xrep_adoption *adopt); +int xrep_adoption_compute_name(struct xrep_adoption *adopt, + struct xfs_name *xname); +int xrep_adoption_move(struct xrep_adoption *adopt); +int xrep_adoption_trans_roll(struct xrep_adoption *adopt); +#else +struct xrep_adoption { /* empty */ }; +# define xrep_orphanage_rele(sc) ((void)0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_ORPHANAGE_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 7db873672146..3b692c4acc1e 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -10,19 +10,37 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/repair.h" +#include "scrub/listxattr.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/trace.h" /* Set us up to scrub parents. */ int xchk_setup_parent( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_parent(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } @@ -114,6 +132,14 @@ xchk_parent_validate( return 0; } + /* Is this the metadata root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_metadirip) { + if (sc->ip->i_ino != mp->m_sb.sb_metadirino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -143,7 +169,8 @@ xchk_parent_validate( } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; - if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { + if (dp == sc->ip || xrep_is_tempfile(dp) || + !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -166,6 +193,12 @@ xchk_parent_validate( goto out_unlock; } + /* Metadata and regular inodes cannot cross trees. 
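+	 * The parent named by dotdot must live in the same tree as the
+	 * child: metadir inodes may only be parented by metadir inodes,
+	 * and regular inodes only by regular inodes.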
*/ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -185,6 +218,629 @@ out_rele: return error; } +/* + * Checking of Parent Pointers + * =========================== + * + * On filesystems with directory parent pointers, we check the referential + * integrity by visiting each parent pointer of a child file and checking that + * the directory referenced by the pointer actually has a dirent pointing + * forward to the child file. + */ + +/* Deferred parent pointer entry that we saved for later. */ +struct xchk_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_pptrs { + struct xfs_scrub *sc; + + /* How many parent pointers did we find at the end? */ + unsigned long long pptrs_found; + + /* Parent of this directory. */ + xfs_ino_t parent_ino; + + /* Fixed-size array of xchk_pptr structures. */ + struct xfarray *pptr_entries; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */ + bool need_revalidate; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Does this parent pointer match the dotdot entry? */ +STATIC int +xchk_parent_scan_dotdot( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + if (pp->parent_ino == parent_ino) + return -ECANCELED; + + return 0; +} + +/* Look up the dotdot entry so that we can check it as we walk the pptrs. */ +STATIC int +xchk_parent_pptr_and_dotdot( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (xchk_inode_is_dirtree_root(sc->ip)) { + if (sc->ip->i_ino != pp->parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * If this is now an unlinked directory, the dotdot value is + * meaningless as long as it points to a valid inode. + */ + if (VFS_I(sc->ip)->i_nlink == 0) + return 0; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Otherwise, walk the pptrs again, and check. */ + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp); + if (error == -ECANCELED) { + /* Found a parent pointer that matches dotdot. */ + return 0; + } + if (!error || error == -EFSCORRUPTED) { + /* Found a broken parent pointer or no match. 
*/ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + return error; +} + +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_lock_dir( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the forward link (dirent) associated with this parent pointer. */ +STATIC int +xchk_parent_dirent( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = pp->sc; + xfs_ino_t child_ino; + int error; + + /* + * Use the name attached to this parent pointer to look up the + * directory entry in the alleged parent. + */ + error = xchk_dir_lookup(sc, dp, xname, &child_ino); + if (error == -ENOENT) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* Does the inode number match? */ + if (child_ino != sc->ip->i_ino) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + return 0; +} + +/* Try to grab a parent directory. */ +STATIC int +xchk_parent_iget( + struct xchk_pptrs *pp, + const struct xfs_parent_rec *pptr, + struct xfs_inode **dpp) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *ip; + xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino); + int error; + + /* Validate inode number. */ + error = xfs_dir_ino_validate(sc->mp, parent_ino); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + error = xchk_iget(sc, parent_ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* The parent must be a directory. */ + if (!S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + /* Validate generation number. */ + if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + *dpp = ip; + return 0; +out_rele: + xchk_irele(sc, ip); + return 0; +} + +/* + * Walk an xattr of a file. If this xattr is a parent pointer, follow it up + * to a parent directory and check that the parent has a dirent pointing back + * to us. 
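+ * + * Concretely: each parent pointer (p_ino, p_gen, name) stored in the child + * must match a dirent in directory p_ino that maps name to this child's + * inode number, and p_gen must equal that directory's generation number.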
+ */ +STATIC int +xchk_parent_scan_attr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_pptrs *pp = priv; + struct xfs_inode *dp = NULL; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + unsigned int lockmode; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return error; + } + + /* No self-referential parent pointers. */ + if (parent_ino == sc->ip->i_ino) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + pp->pptrs_found++; + + error = xchk_parent_iget(pp, pptr_rec, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* Try to lock the inode. */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (!lockmode) { + struct xchk_pptr save_pp = { + .pptr_rec = *pptr_rec, /* struct copy */ + .namelen = namelen, + }; + + /* Couldn't lock the inode, so save the pptr for later. */ + trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino); + + error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie, + &xname); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + error = xfarray_append(pp->pptr_entries, &save_pp); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + goto out_rele; + } + + error = xchk_parent_dirent(pp, &xname, dp); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* + * Revalidate a parent pointer that we collected in the past but couldn't check + * because of lock contention. Returns 0 if the parent pointer is still valid, + * -ENOENT if it has gone away on us, or a negative errno. + */ +STATIC int +xchk_parent_revalidate_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args); + if (error == -ENOATTR) { + /* Parent pointer went away, nothing to revalidate. */ + return -ENOENT; + } + + return error; +} + +/* + * Check a parent pointer the slow way, which means we cycle locks a bunch + * and put up with revalidation until we get it done. + */ +STATIC int +xchk_parent_slow_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *dp = NULL; + unsigned int lockmode; + int error; + + /* Check that the deferred parent pointer still exists. */ + if (pp->need_revalidate) { + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + return error; + } + + error = xchk_parent_iget(pp, pptr, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged parent, we + * can proceed with the validation. + */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (lockmode) { + trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino); + goto check_dirent; + } + + /* + * We couldn't lock the parent dir. Drop all the locks and try to + * get them again, one at a time. 
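+ * After cycling the child's ILOCK, any parent pointer that we collected + * earlier may be stale, so set need_revalidate to force each deferred + * pointer to be looked up again before use.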
+ */ + xchk_iunlock(sc, sc->ilock_flags); + pp->need_revalidate = true; + + trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode); + if (error) + goto out_rele; + + /* Revalidate the parent pointer now that we cycled locks. */ + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out_unlock; + +check_dirent: + error = xchk_parent_dirent(pp, xname, dp); +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* Check all the parent pointers that we deferred the first time around. */ +STATIC int +xchk_parent_finish_slow_pptrs( + struct xchk_pptrs *pp) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(pp->pptr_entries, array_cur) { + struct xchk_pptr pptr; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(pp->pptr_entries, array_cur, &pptr); + if (error) + return error; + + error = xfblob_loadname(pp->pptr_names, pptr.name_cookie, + &pp->xname, pptr.namelen); + if (error) + return error; + + error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec); + if (error) + return error; + } + + /* Empty out both xfiles now that we've checked everything. */ + xfarray_truncate(pp->pptr_entries); + xfblob_truncate(pp->pptr_names); + return 0; +} + +/* Count the number of parent pointers. */ +STATIC int +xchk_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + pp->pptrs_found++; + return 0; +} + +/* + * Compare the number of parent pointers to the link count. For + * non-directories these should be the same. For unlinked directories the + * count should be zero; for linked directories, it should be nonzero. + */ +STATIC int +xchk_parent_count_pptrs( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* + * If we cycled the ILOCK while cross-checking parent pointers with + * dirents, then we need to recalculate the number of parent pointers. + */ + if (pp->need_revalidate) { + pp->pptrs_found = 0; + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr, + NULL, pp); + if (error == -EFSCORRUPTED) { + /* Found a bad parent pointer */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (error) + return error; + } + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + if (xchk_inode_is_dirtree_root(sc->ip)) + pp->pptrs_found++; + + if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + else if (VFS_I(sc->ip)->i_nlink > 0 && + pp->pptrs_found == 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Pretend that we found a parent pointer attr. + */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + pp->pptrs_found++; + + if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return 0; +} + +/* Check parent pointers of a file. 
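+ * + * This runs in phases: walk the attr fork, cross-checking each parent + * pointer against the alleged parent's dirents and staging any pointer + * whose parent cannot be locked; re-check the staged pointers; for + * subdirectories, confirm that dotdot matches one of the parents; and + * finally compare the number of parent pointers found against the link + * count.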
*/ +STATIC int +xchk_parent_pptr( + struct xfs_scrub *sc) +{ + struct xchk_pptrs *pp; + char *descr; + int error; + + pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS); + if (!pp) + return -ENOMEM; + pp->sc = sc; + pp->xname.name = pp->namebuf; + + /* + * Set up some staging memory for parent pointers that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_pptr), + &pp->pptr_entries); + kfree(descr); + if (error) + goto out_pp; + + descr = xchk_xfile_ino_descr(sc, "slow parent pointer names"); + error = xfblob_create(descr, &pp->pptr_names); + kfree(descr); + if (error) + goto out_entries; + + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp); + if (error == -ECANCELED) { + error = 0; + goto out_names; + } + if (error) + goto out_names; + + error = xchk_parent_finish_slow_pptrs(pp); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_names; + + /* + * For subdirectories, make sure the dotdot entry references the same + * inode as the parent pointers. + * + * If we're scanning a /consistent/ directory, there should only be + * one parent pointer, and it should point to the same directory as + * the dotdot entry. + * + * However, a corrupt directory tree might feature a subdirectory with + * multiple parents. The directory loop scanner is responsible for + * correcting that kind of problem, so for now we only validate that + * the dotdot entry matches /one/ of the parents. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xchk_parent_pptr_and_dotdot(pp); + if (error) + goto out_names; + } + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_names; + + /* + * Complain if the number of parent pointers doesn't match the link + * count. This could be a sign of missing parent pointers (or an + * incorrect link count). + */ + error = xchk_parent_count_pptrs(pp); + if (error) + goto out_names; + +out_names: + xfblob_destroy(pp->pptr_names); +out_entries: + xfarray_destroy(pp->pptr_entries); +out_pp: + kvfree(pp); + return error; +} + /* Scrub a parent pointer. */ int xchk_parent( @@ -194,6 +850,9 @@ xchk_parent( xfs_ino_t parent_ino; int error = 0; + if (xfs_has_parent(mp)) + return xchk_parent_pptr(sc); + /* * If we're a directory, check that the '..' link points up to * a directory that has one entry pointing to us. @@ -237,3 +896,63 @@ xchk_parent( return error; } + +/* + * Decide if this file's extended attributes (and therefore its parent + * pointers) have been zapped to satisfy the inode and ifork verifiers. + * Checking and repairing should be postponed until the extended attribute + * structure is fixed. + */ +bool +xchk_pptr_looks_zapped( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_has_parent(ip->i_mount)); + + /* + * Temporary files that cannot be linked into the directory tree do not + * have attr forks because they cannot ever have parents. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return false; + + /* + * Directory tree roots do not have parents, so the expected outcome + * of a parent pointer scan is always the empty set. It's safe to scan + * them even if the attr fork was zapped. 
+ */
+	if (xchk_inode_is_dirtree_root(ip))
+		return false;
+
+	/*
+	 * Metadata inodes that are rooted in the superblock do not have any
+	 * parents. Hence the attr fork will not be initialized, but there are
+	 * no parent pointers that might have been zapped.
+	 */
+	if (xchk_inode_is_sb_rooted(ip))
+		return false;
+
+	/*
+	 * Linked and linkable non-rootdir files should always have an
+	 * attribute fork because that is where parent pointers are
+	 * stored. If the fork is absent, something is amiss.
+	 */
+	if (!xfs_inode_has_attr_fork(ip))
+		return true;
+
+	/* Repair zapped this file's attr fork a short time ago */
+	if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
+		return true;
+
+	/*
+	 * If the dinode repair found a bad attr fork, it will reset the fork
+	 * to extents format with zero records and wait for the bmapbta
+	 * scrubber to reconstruct the block mappings. The extended attribute
+	 * structure always contains some content when parent pointers are
+	 * enabled, so this is a clear sign of a zapped attr fork.
+	 */
+	return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+	       ip->i_af.if_nextents == 0;
+}
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..31bfe10be22a
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,1639 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/orphanage.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr_repair.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information
+ * about conflicts with the VFS. The findparent code will keep our incore
+ * parent inode up to date.
+ *
+ * If parent pointers are enabled, we instead reconstruct the parent pointer
+ * information by visiting every directory entry of every directory in the
+ * system and translating the relevant dirents into parent pointers. In this
+ * case, it is advantageous to stash all parent pointers created from dirents
+ * from a single parent file before replaying them into the temporary file. To
+ * save memory, the live filesystem scan reuses the findparent object. Parent
+ * pointer repair chooses either directory scanning or findparent, but not
+ * both.
+ *
+ * When salvaging completes, the remaining stashed entries are replayed to the
+ * temporary file. All non-parent pointer extended attributes are copied to
+ * the temporary file's extended attributes. An atomic file mapping exchange
+ * is used to commit the new xattr blocks to the file being repaired. This
+ * will disrupt attrmulti cursors.
+ */
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_PPTR_ADD		(1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_PPTR_REMOVE	(2)
+
+/* A stashed parent pointer update. */
+struct xrep_pptr {
+	/* Cookie for retrieval of the pptr name. */
+	xfblob_cookie		name_cookie;
+
+	/* Parent pointer record. */
+	struct xfs_parent_rec	pptr_rec;
+
+	/* Length of the pptr name. */
+	uint8_t			namelen;
+
+	/* XREP_PPTR_{ADD,REMOVE} */
+	uint8_t			action;
+};
+
+/*
+ * Stash up to 8 pages of recovered parent pointers in pptr_recs and
+ * pptr_names before we write them to the temp file.
+ */
+#define XREP_PARENT_MAX_STASH_BYTES	(PAGE_SIZE * 8)
+
+struct xrep_parent {
+	struct xfs_scrub	*sc;
+
+	/* Fixed-size array of xrep_pptr structures. */
+	struct xfarray		*pptr_recs;
+
+	/* Blobs containing parent pointer names. */
+	struct xfblob		*pptr_names;
+
+	/* xattr keys */
+	struct xfarray		*xattr_records;
+
+	/* xattr values */
+	struct xfblob		*xattr_blobs;
+
+	/* Scratch buffers for saving extended attributes */
+	unsigned char		*xattr_name;
+	void			*xattr_value;
+	unsigned int		xattr_value_sz;
+
+	/*
+	 * Information used to exchange the attr fork mappings, if the fs
+	 * supports parent pointers.
+	 */
+	struct xrep_tempexch	tx;
+
+	/*
+	 * Information used to scan the filesystem to find the inumber of the
+	 * dotdot entry for this directory. On filesystems without parent
+	 * pointers, we use the findparent_* functions on this object and
+	 * access only the parent_ino field directly.
+	 *
+	 * When parent pointers are enabled, the directory entry scanner uses
+	 * the iscan, hooks, and lock fields of this object directly.
+	 * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and
+	 * pptr_scratch. This reduces the memory requirements of this
+	 * structure.
+	 *
+	 * The lock is also taken when checking saw_pptr_updates, which tells
+	 * the xattr copy that parent pointer updates have raced with it and
+	 * that the copied attr structure has gone stale.
+	 */
+	struct xrep_parent_scan_info pscan;
+
+	/* Orphanage reparenting request. */
+	struct xrep_adoption	adoption;
+
+	/* Directory entry name, plus the trailing null. */
+	struct xfs_name		xname;
+	unsigned char		namebuf[MAXNAMELEN];
+
+	/* Scratch buffer for scanning pptr xattrs */
+	struct xfs_da_args	pptr_args;
+
+	/* Have we seen any live updates of parent pointers recently? */
+	bool			saw_pptr_updates;
+
+	/* Number of parents we found after all other repairs */
+	unsigned long long	parents;
+};
+
+struct xrep_parent_xattr {
+	/* Cookie for retrieval of the xattr name. */
+	xfblob_cookie		name_cookie;
+
+	/* Cookie for retrieval of the xattr value. */
+	xfblob_cookie		value_cookie;
+
+	/* XFS_ATTR_* flags */
+	int			flags;
+
+	/* Length of the value and name. */
+	uint32_t		valuelen;
+	uint16_t		namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
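+ * As with XREP_PARENT_MAX_STASH_BYTES above, this bounds the kernel memory
+ * consumed by the scan: once the xfarray and xfblob grow past the limit,
+ * the stashed attrs are flushed to the temporary file and the staging
+ * areas are truncated.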
+ */ +#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +/* Tear down all the incore stuff we created. */ +static void +xrep_parent_teardown( + struct xrep_parent *rp) +{ + xrep_findparent_scan_teardown(&rp->pscan); + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + if (rp->xattr_blobs) + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; + if (rp->xattr_records) + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; + if (rp->pptr_names) + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; + if (rp->pptr_recs) + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +} + +/* Set up for a parent repair. */ +int +xrep_setup_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS); + if (!rp) + return -ENOMEM; + rp->sc = sc; + rp->xname.name = rp->namebuf; + sc->buf = rp; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + return xrep_orphanage_try_create(sc); +} + +/* + * Scan all files in the filesystem for a child dirent that we can turn into + * the dotdot entry for this directory. + */ +STATIC int +xrep_parent_find_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int sick, checked; + int error; + + /* + * Avoid sick directories. There shouldn't be anyone else clearing the + * directory's sick status. + */ + xfs_inode_measure_sickness(sc->ip, &sick, &checked); + if (sick & XFS_SICK_INO_DIR) + return -EFSCORRUPTED; + + ino = xrep_findparent_self_reference(sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + return 0; + } + + /* + * Drop the ILOCK on this directory so that we can scan for the dotdot + * entry. Figure out who is going to be the parent of this directory, + * then retake the ILOCK so that we can salvage directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Does the VFS dcache have an answer for us? */ + ino = xrep_findparent_from_dcache(sc); + if (ino != NULLFSINO) { + error = xrep_findparent_confirm(sc, &ino); + if (!error && ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + goto out_relock; + } + } + + /* Scan the entire filesystem for a parent. */ + error = xrep_findparent_scan(&rp->pscan); +out_relock: + xchk_ilock(sc, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Add this stashed incore parent pointer to the temporary file. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_parent_replay_update( + struct xrep_parent *rp, + const struct xfs_name *xname, + struct xrep_pptr *pptr) +{ + struct xfs_scrub *sc = rp->sc; + + switch (pptr->action) { + case XREP_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_parent_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + case XREP_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_parent_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. 
+ * This is done to reduce the memory requirements of the parent pointer + * rebuild, since files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_parent_replay_updates( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rp->pscan.lock); + foreach_xfarray_idx(rp->pptr_recs, array_cur) { + struct xrep_pptr pptr; + + error = xfarray_load(rp->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rp->pptr_names, pptr.name_cookie, + &rp->xname, pptr.namelen); + if (error) + goto out_unlock; + rp->xname.len = pptr.namelen; + mutex_unlock(&rp->pscan.lock); + + error = xrep_parent_replay_update(rp, &rp->xname, &pptr); + if (error) + return error; + + mutex_lock(&rp->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->pptr_recs); + xfblob_truncate(rp->pptr_names); + mutex_unlock(&rp->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentadd( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentremove( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Examine an entry of a directory. If this dirent leads us back to the file + * whose parent pointers we're rebuilding, add a pptr to the temporary + * directory. + */ +STATIC int +xrep_parent_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + /* Dirent doesn't point to this directory. */ + if (ino != rp->sc->ip->i_ino) + return 0; + + /* No weird looking names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* No mismatching ftypes. */ + if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Transform this dirent into a parent pointer and queue it for later + * addition to the temporary file. 
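+	 * The stashed record needs only the parent's inumber and generation
+	 * (captured from @dp by xfs_inode_to_parent_rec) and the dirent name,
+	 * which is exactly what xfs_parent_set will want during replay.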
+ */ + mutex_lock(&rp->pscan.lock); + error = xrep_parent_stash_parentadd(rp, name, dp); + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Decide if we want to look for dirents in this directory. Skip the file + * being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_parent_want_scan( + struct xrep_parent *rp, + const struct xfs_inode *ip) +{ + return ip != rp->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt. + * Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_parent_scan_ilock( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Still need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_parent_want_scan(rp, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents that point to the file whose + * parent pointers we're rebuilding. + */ +STATIC int +xrep_parent_scan_file( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_parent_scan_ilock(rp, ip); + + if (!xrep_parent_want_scan(rp, ip)) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rp->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Decide if we've stashed too much pptr data in memory. */ +static inline bool +xrep_parent_want_flush_stashed( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names); + return bytes > XREP_PARENT_MAX_STASH_BYTES; +} + +/* + * Scan all directories in the filesystem to look for dirents that we can turn + * into parent pointers. + */ +STATIC int +xrep_parent_scan_dirtree( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip; + int error; + + /* + * Filesystem scans are time consuming. Drop the file ILOCK and all + * other resources for the duration of the scan and hope for the best. + * The live update hooks will keep our scan information up to date. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_parent_scan_file(rp, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed pptr updates to constrain memory usage. 
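+		 * Replaying them requires that we hold no transaction and
+		 * that we take the tempfile IOLOCK, so cancel the empty scan
+		 * transaction first and recreate it when the flush is done.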
*/ + mutex_lock(&rp->pscan.lock); + flush = xrep_parent_want_flush_stashed(rp); + mutex_unlock(&rp->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_parent_replay_updates(rp); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rp->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Retake sc->ip's ILOCK now that we're done flushing stashed parent + * pointers. We end this function with an empty transaction and the + * ILOCK. + */ + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * file being repaired. + */ +STATIC int +xrep_parent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent *rp; + struct xfs_scrub *sc; + int error; + + rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb); + sc = rp->sc; + + /* + * This thread updated a dirent that points to the file that we're + * repairing, so stash the update for replay against the temporary + * file. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) { + mutex_lock(&rp->pscan.lock); + if (p->delta > 0) + error = xrep_parent_stash_parentadd(rp, p->name, p->dp); + else + error = xrep_parent_stash_parentremove(rp, p->name, + p->dp); + if (!error) + rp->saw_pptr_updates = true; + mutex_unlock(&rp->pscan.lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rp->pscan.iscan); + return NOTIFY_DONE; +} + +/* Reset a directory's dotdot entry, if needed. */ +STATIC int +xrep_parent_reset_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int spaceres; + int error = 0; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino); + if (error || ino == rp->pscan.parent_ino) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino); + + /* + * Reserve more space just in case we have to expand the dir. We're + * allowed to exceed quota to repair inconsistent metadata. + */ + spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len, + false); + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0, + true); + if (error) + return error; + + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + rp->pscan.parent_ino, spaceres); + if (error) + return error; + + /* + * Roll transaction to detach the inode from the transaction but retain + * ILOCK_EXCL. 
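+	 * Because the inode was joined with lock_flags == 0, committing the
+	 * old transaction does not release the ILOCK, and the new transaction
+	 * starts with no inodes attached.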
+ */
+	return xfs_trans_roll(&sc->tp);
+}
+
+/* Pass back the parent inumber if this is a parent pointer */
+STATIC int
+xrep_parent_lookup_pptr(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen,
+	void			*priv)
+{
+	xfs_ino_t		*inop = priv;
+	xfs_ino_t		parent_ino;
+	int			error;
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, &parent_ino, NULL);
+	if (error)
+		return error;
+
+	*inop = parent_ino;
+	return -ECANCELED;
+}
+
+/*
+ * Find the first parent of the scrub target by walking parent pointers for
+ * the purpose of deciding if we're going to move it to the orphanage.
+ * We don't care if the attr fork is zapped.
+ */
+STATIC int
+xrep_parent_lookup_pptrs(
+	struct xfs_scrub	*sc,
+	xfs_ino_t		*inop)
+{
+	int			error;
+
+	*inop = NULLFSINO;
+
+	error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL,
+			inop);
+	if (error && error != -ECANCELED)
+		return error;
+	return 0;
+}
+
+/*
+ * Move the current file to the orphanage.
+ *
+ * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon
+ * successful return, the scrub transaction will have enough extra reservation
+ * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
+ * orphanage; and both inodes will be ijoined.
+ */
+STATIC int
+xrep_parent_move_to_orphanage(
+	struct xrep_parent	*rp)
+{
+	struct xfs_scrub	*sc = rp->sc;
+	xfs_ino_t		orig_parent, new_parent;
+	int			error;
+
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+		/*
+		 * We are about to drop the ILOCK on sc->ip to lock the
+		 * orphanage and prepare for the adoption. Therefore, look up
+		 * the old dotdot entry for sc->ip so that we can compare it
+		 * after we re-lock sc->ip.
+		 */
+		error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+				&orig_parent);
+		if (error)
+			return error;
+	} else {
+		/*
+		 * We haven't dropped the ILOCK since we committed the new
+		 * xattr structure (and hence the new parent pointer records),
+		 * which means that the file cannot have been moved in the
+		 * directory tree, and there are no parents.
+		 */
+		orig_parent = NULLFSINO;
+	}
+
+	/*
+	 * Drop the ILOCK on the scrub target and commit the transaction.
+	 * Adoption computes its own resource requirements and gathers the
+	 * necessary components.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	/* If we can take the orphanage's iolock then we're ready to move. */
+	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+		xchk_iunlock(sc, sc->ilock_flags);
+		error = xrep_orphanage_iolock_two(sc);
+		if (error)
+			return error;
+	}
+
+	/* Grab transaction and ILOCK the two files. */
+	error = xrep_adoption_trans_alloc(sc, &rp->adoption);
+	if (error)
+		return error;
+
+	error = xrep_adoption_compute_name(&rp->adoption, &rp->xname);
+	if (error)
+		return error;
+
+	/*
+	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
+	 * entry again. If the parent changed or the child was unlinked while
+	 * the child directory was unlocked, we don't need to move the child to
+	 * the orphanage after all. For a non-directory, we have to scan for
+	 * the first parent pointer to see if one has been added.
+ */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &new_parent); + else + error = xrep_parent_lookup_pptrs(sc, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rp->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rp->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* Ensure that the xattr value buffer is large enough. */ +STATIC int +xrep_parent_alloc_xattr_value( + struct xrep_parent *rp, + size_t bufsize) +{ + void *new_val; + + if (rp->xattr_value_sz >= bufsize) + return 0; + + if (rp->xattr_value) { + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + rp->xattr_value_sz = 0; + } + + new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + rp->xattr_value = new_val; + rp->xattr_value_sz = bufsize; + return 0; +} + +/* Retrieve the (remote) value of a non-pptr xattr. */ +STATIC int +xrep_parent_fetch_xattr_remote( + struct xrep_parent *rp, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + unsigned int valuelen) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_da_args args = { + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = ip, + .name = name, + .namelen = namelen, + .trans = sc->tp, + .valuelen = valuelen, + .owner = ip->i_ino, + }; + int error; + + /* + * If we need a larger value buffer, try to allocate one. If that + * fails, return with -EDEADLOCK to try harder. + */ + error = xrep_parent_alloc_xattr_value(rp, valuelen); + if (error == -ENOMEM) + return -EDEADLOCK; + if (error) + return error; + + args.value = rp->xattr_value; + xfs_attr_sethash(&args); + return xfs_attr_get_ilocked(&args); +} + +/* Stash non-pptr attributes for later replay into the temporary file. */ +STATIC int +xrep_parent_stash_xattr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent_xattr key = { + .valuelen = valuelen, + .namelen = namelen, + .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + struct xrep_parent *rp = priv; + int error; + + if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT)) + return 0; + + if (!value) { + error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags, + name, namelen, valuelen); + if (error) + return error; + + value = rp->xattr_value; + } + + trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name, + key.namelen, key.valuelen); + + error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + return xfarray_append(rp->xattr_records, &key); +} + +/* Insert one xattr key/value. 
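+ * The name and value are loaded from the xfblob into the scratch buffers
+ * (and the blob space freed immediately), then the attr is set in the
+ * tempfile with @owner pointing at the file being repaired so that the attr
+ * block headers are correct after the fork contents are exchanged.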
*/ +STATIC int +xrep_parent_insert_xattr( + struct xrep_parent *rp, + const struct xrep_parent_xattr *key) +{ + struct xfs_da_args args = { + .dp = rp->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rp->sc->ip->i_ino, + .geo = rp->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + int error; + + ASSERT(!(key->flags & XFS_ATTR_PARENT)); + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. + */ + args.name = rp->xattr_name; + args.value = rp->xattr_value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + rp->xattr_name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->value_cookie); + if (error) + return error; + + rp->xattr_name[key->namelen] = 0; + + trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags, + rp->xattr_name, key->namelen, key->valuelen); + + xfs_attr_sethash(&args); + return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false); +} + +/* + * Periodically flush salvaged attributes to the temporary file. This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_parent_flush_xattrs( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and the empty scrub + * transaction that we created for the xattr scan. We hold ILOCK_EXCL + * on the inode being repaired. + * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the ILOCK so that xfs_attr_set can + * allocate whatever transaction it wants. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from adding xattrs (or parent pointers) while we're + * flushing. + */ + xchk_trans_cancel(rp->sc); + xchk_iunlock(rp->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rp->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rp->xattr_records, array_cur) { + struct xrep_parent_xattr key; + + error = xfarray_load(rp->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_parent_insert_xattr(rp, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->xattr_records); + xfblob_truncate(rp->xattr_blobs); + + xrep_tempfile_iounlock(rp->sc); + + /* Recreate the empty transaction and relock the inode. 
*/ + error = xchk_trans_alloc_empty(rp->sc); + if (error) + return error; + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_parent_want_flush_xattrs( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->xattr_records) + + xfblob_bytes(rp->xattr_blobs); + return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES; +} + +/* Flush staged attributes to the temporary file if we're over the limit. */ +STATIC int +xrep_parent_try_flush_xattrs( + struct xfs_scrub *sc, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!xrep_parent_want_flush_xattrs(rp)) + return 0; + + error = xrep_parent_flush_xattrs(rp); + if (error) + return error; + + /* + * If there were any parent pointer updates to the xattr structure + * while we dropped the ILOCK, the xattr structure is now stale. + * Signal to the attr copy process that we need to start over, but + * this time without opportunistic attr flushing. + * + * This is unlikely to happen, so we're ok with restarting the copy. + */ + mutex_lock(&rp->pscan.lock); + if (rp->saw_pptr_updates) + error = -ESTALE; + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* Copy all the non-pptr extended attributes into the temporary file. */ +STATIC int +xrep_parent_copy_xattrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there + * can't be any parent pointer updates in progress. + */ + mutex_lock(&rp->pscan.lock); + rp->saw_pptr_updates = false; + mutex_unlock(&rp->pscan.lock); + + /* Copy xattrs, stopping periodically to flush the incore buffers. */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + xrep_parent_try_flush_xattrs, rp); + if (error && error != -ESTALE) + return error; + + if (error == -ESTALE) { + /* + * The xattr copy collided with a parent pointer update. + * Restart the copy, but this time hold the ILOCK all the way + * to the end to lock out any directory parent pointer updates. + */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + NULL, rp); + if (error) + return error; + } + + /* Flush any remaining stashed xattrs to the temporary file. */ + if (xfarray_bytes(rp->xattr_records) == 0) + return 0; + + return xrep_parent_flush_xattrs(rp); +} + +/* + * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head + * into the attr fork exchange transaction. All files on a filesystem with + * parent pointers must have an attr fork because the parent pointer code does + * not itself add attribute forks. + * + * Note: Unlinkable unlinked files don't need one, but the overhead of having + * an unnecessary attr fork is not justified by the additional code complexity + * that would be needed to track that state correctly. + */ +STATIC int +xrep_parent_ensure_attr_fork( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + error = xfs_attr_add_fork(sc->tempip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1); +} + +/* + * Finish replaying stashed parent pointer updates, allocate a transaction for + * exchanging extent mappings, and take the ILOCKs of both files before we + * commit the new attribute structure. 
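+ * Note that replaying the stashed updates can itself require dropping the
+ * ILOCKs and the transaction, which is why the loop below keeps going until
+ * a pass completes with no stashed updates remaining.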
+ */
+STATIC int
+xrep_parent_finalize_tempfile(
+	struct xrep_parent	*rp)
+{
+	struct xfs_scrub	*sc = rp->sc;
+	int			error;
+
+	/*
+	 * Repair relies on the ILOCK to quiesce all possible xattr updates.
+	 * Replay all queued parent pointer updates into the tempfile before
+	 * exchanging the contents, even if that means dropping the ILOCKs and
+	 * the transaction.
+	 */
+	do {
+		error = xrep_parent_replay_updates(rp);
+		if (error)
+			return error;
+
+		error = xrep_parent_ensure_attr_fork(rp);
+		if (error)
+			return error;
+
+		error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx);
+		if (error)
+			return error;
+
+		if (xfarray_length(rp->pptr_recs) == 0)
+			break;
+
+		xchk_trans_cancel(sc);
+		xrep_tempfile_iunlock_both(sc);
+	} while (!xchk_should_terminate(sc, &error));
+	return error;
+}
+
+/*
+ * Replay all the stashed parent pointers into the temporary file, copy all
+ * the non-pptr xattrs from the file being repaired into the temporary file,
+ * and exchange the attr fork contents atomically.
+ */
+STATIC int
+xrep_parent_rebuild_pptrs(
+	struct xrep_parent	*rp)
+{
+	struct xfs_scrub	*sc = rp->sc;
+	xfs_ino_t		parent_ino = NULLFSINO;
+	int			error;
+
+	/*
+	 * Copy non-pptr xattrs from the file being repaired into the
+	 * temporary file's xattr structure. We hold sc->ip's IOLOCK, which
+	 * prevents setxattr/removexattr calls from occurring, but renames
+	 * update the parent pointers without holding IOLOCK. If we detect
+	 * stale attr structures, we restart the scan but only flush at the
+	 * end.
+	 */
+	error = xrep_parent_copy_xattrs(rp);
+	if (error)
+		return error;
+
+	/*
+	 * Cancel the empty transaction that we used to walk and copy attrs,
+	 * and drop the ILOCK so that we can take the IOLOCK on the temporary
+	 * file. We still hold sc->ip's IOLOCK.
+	 */
+	xchk_trans_cancel(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+	error = xrep_tempfile_iolock_polled(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Allocate transaction, lock inodes, and make sure that we've replayed
+	 * all the stashed pptr updates to the tempfile. After this point,
+	 * we're ready to exchange the attr fork mappings.
+	 */
+	error = xrep_parent_finalize_tempfile(rp);
+	if (error)
+		return error;
+
+	/* Last chance to abort before we start committing pptr fixes. */
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	if (xchk_iscan_aborted(&rp->pscan.iscan))
+		return -ECANCELED;
+
+	/*
+	 * Exchange the attr fork contents and junk the old attr fork contents,
+	 * which are now in the tempfile.
+	 */
+	error = xrep_xattr_swap(sc, &rp->tx);
+	if (error)
+		return error;
+	error = xrep_xattr_reset_tempfile_fork(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Roll to get a transaction without any inodes joined to it. Then we
+	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+	 * the scrub target file.
+	 */
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		return error;
+	xrep_tempfile_iunlock(sc);
+	xrep_tempfile_iounlock(sc);
+
+	/*
+	 * We've committed the new parent pointers. Find at least one parent
+	 * so that we can decide if we're moving this file to the orphanage.
+	 * For this purpose, root directories are their own parents.
+ */ + if (xchk_inode_is_dirtree_root(sc->ip)) { + xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); + } else { + error = xrep_parent_lookup_pptrs(sc, &parent_ino); + if (error) + return error; + if (parent_ino != NULLFSINO) + xrep_findparent_scan_found(&rp->pscan, parent_ino); + } + return 0; +} + +/* + * Commit the new parent pointer structure (currently only the dotdot entry) to + * the file that we're repairing. + */ +STATIC int +xrep_parent_rebuild_tree( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + bool try_adoption; + int error; + + if (xfs_has_parent(sc->mp)) { + error = xrep_parent_rebuild_pptrs(rp); + if (error) + return error; + } + + /* + * Any file with no parent could be adopted. This check happens after + * rebuilding the parent pointer structure because we might have cycled + * the ILOCK during that process. + */ + try_adoption = rp->pscan.parent_ino == NULLFSINO; + + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Lack of parent is ok here. + */ + if (try_adoption && xfs_has_metadir(sc->mp) && + xchk_inode_is_sb_rooted(sc->ip)) + try_adoption = false; + + if (try_adoption) { + if (xrep_orphanage_can_adopt(sc)) + return xrep_parent_move_to_orphanage(rp); + return -EFSCORRUPTED; + + } + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + return xrep_parent_reset_dotdot(rp); + + return 0; +} + +/* Count the number of parent pointers. */ +STATIC int +xrep_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + rp->parents++; + return 0; +} + +/* + * After all parent pointer rebuilding and adoption activity completes, reset + * the link count of this nondirectory, having scanned the fs to rebuild all + * parent pointers. + */ +STATIC int +xrep_parent_set_nondir_nlink( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip = sc->ip; + struct xfs_perag *pag; + bool joined = false; + int error; + + /* Count parent pointers so we can reset the file link count. */ + rp->parents = 0; + error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp); + if (error) + return error; + + /* + * Starting with metadir, we allow checking of parent pointers of + * non-directory files that are children of the superblock. Pretend + * that we found a parent pointer attr. + */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + rp->parents++; + + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is on the unlinked list but we found parents. + * Remove the file from the unlinked list. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is not on the unlinked list but we found no + * parents. Add the file to the unlinked list. 
+ */ + error = xfs_iunlink(sc->tp, ip); + if (error) + return error; + } + + /* Set the correct link count. */ + if (VFS_I(ip)->i_nlink != rp->parents) { + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents, + XFS_NLINK_PINNED)); + } + + /* Log the inode to keep it moving forward if we dirtied anything. */ + if (joined) + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + return 0; +} + +/* Set up the filesystem scan so we can look for parents. */ +STATIC int +xrep_parent_setup_scan( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + char *descr; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + int max_len; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_findparent_scan_start(sc, &rp->pscan); + + /* Buffers for copying non-pptr attrs to the tempfile */ + rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!rp->xattr_name) + return -ENOMEM; + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values, so + * TRY_HARDER means we allocate the maximal attr value size. + */ + if (sc->flags & XCHK_TRY_HARDER) + max_len = XATTR_SIZE_MAX; + else + max_len = xfs_attr_leaf_entsize_local_max(geo->blksize); + error = xrep_parent_alloc_xattr_value(rp, max_len); + if (error) + goto out_xattr_name; + + /* Set up some staging memory for logging parent pointer updates. */ + descr = xchk_xfile_ino_descr(sc, "parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_pptr), + &rp->pptr_recs); + kfree(descr); + if (error) + goto out_xattr_value; + + descr = xchk_xfile_ino_descr(sc, "parent pointer names"); + error = xfblob_create(descr, &rp->pptr_names); + kfree(descr); + if (error) + goto out_recs; + + /* Set up some storage for copying attrs before the mapping exchange */ + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr), + &rp->xattr_records); + kfree(descr); + if (error) + goto out_names; + + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr values"); + error = xfblob_create(descr, &rp->xattr_blobs); + kfree(descr); + if (error) + goto out_attr_keys; + + error = __xrep_findparent_scan_start(sc, &rp->pscan, + xrep_parent_live_update); + if (error) + goto out_attr_values; + + return 0; + +out_attr_values: + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; +out_attr_keys: + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; +out_names: + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; +out_recs: + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +out_xattr_value: + kvfree(rp->xattr_value); + rp->xattr_value = NULL; +out_xattr_name: + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + return error; +} + +int +xrep_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp = sc->buf; + int error; + + /* + * When the parent pointers feature is enabled, repairs are committed + * by atomically committing a new xattr structure and reaping the old + * attr fork. Reaping requires rmap and exchange-range to be enabled. 
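+ * If either feature is missing, bail out with -EOPNOTSUPP before setting up
+ * the filesystem scan, since there would be no way to commit the repair.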
+ */ + if (xfs_has_parent(sc->mp)) { + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + } + + error = xrep_parent_setup_scan(rp); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_parent_scan_dirtree(rp); + else + error = xrep_parent_find_dotdot(rp); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing dotdot fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_parent_rebuild_tree(rp); + if (error) + goto out_teardown; + if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xrep_parent_set_nondir_nlink(rp); + if (error) + goto out_teardown; + } + + error = xrep_defer_finish(sc); + +out_teardown: + xrep_parent_teardown(rp); + return error; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 183d531875ea..58d6d4ed2853 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -212,12 +212,18 @@ xchk_quota_item( if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_warning(sc, XFS_DATA_FORK, + offset); } else { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); } - if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) + if (dq->q_ino.count > fs_icount) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); /* diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 0bab4c30cb85..8f4c8d41f308 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -12,7 +12,6 @@ #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_bit.h" -#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -77,8 +76,6 @@ xrep_quota_item_fill_bmap_hole( irec, &nmaps); if (error) return error; - if (nmaps != 1) - return -ENOSPC; dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); @@ -236,7 +233,7 @@ xrep_quota_item( rqi->need_quotacheck = true; dirty = true; } - if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) { dq->q_rtb.reserved -= dq->q_rtb.count; dq->q_rtb.reserved += mp->m_sb.sb_rblocks; dq->q_rtb.count = mp->m_sb.sb_rblocks; @@ -444,10 +441,6 @@ xrep_quota_data_fork( XFS_BMAPI_CONVERT, 0, &nrec, &nmap); if (error) goto out; - if (nmap != 1) { - error = -ENOSPC; - goto out; - } ASSERT(nrec.br_startoff == irec.br_startoff); ASSERT(nrec.br_blockcount == irec.br_blockcount); diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index c77eb2de8df7..dc4033b91e44 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -398,10 +398,13 @@ xqcheck_collect_inode( bool isreg = S_ISREG(VFS_I(ip)->i_mode); int error = 0; - if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + if (xfs_is_metadir_inode(ip) || + xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { /* * Quota files are never counted towards quota, so we do not - * need to take the lock. + * need to take the lock. Files do not switch between the + * metadata and regular directory trees without a reallocation, + * so we do not need to ILOCK them either. 
*/ xchk_iscan_mark_visited(&xqc->iscan, ip); return 0; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index dfdcb96b6c16..01c9a2dc0f2c 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_error.h" #include "scrub/scrub.h" +#include "scrub/common.h" #include "scrub/readdir.h" /* Call a function for every entry in a shortform directory. */ @@ -99,7 +100,7 @@ xchk_dir_walk_block( unsigned int off, next_off, end; int error; - error = xfs_dir3_block_read(sc->tp, dp, &bp); + error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp); if (error) return error; @@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf( if (new_off > *curoff) *curoff = new_off; - return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); + return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp); } /* Call a function for every entry in a leaf directory. */ @@ -273,8 +274,8 @@ xchk_dir_walk( .dp = dp, .geo = dp->i_mount->m_dir_geo, .trans = sc->tp, + .owner = dp->i_ino, }; - bool isblock; int error; if (xfs_is_shutdown(dp->i_mount)) @@ -283,22 +284,17 @@ xchk_dir_walk( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_SF: return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); - - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; - - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; - - if (isblock) + case XFS_DIR2_FMT_BLOCK: return xchk_dir_walk_block(sc, dp, dirent_fn, priv); - - return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + default: + return error; + } } /* @@ -324,50 +320,102 @@ xchk_dir_lookup( .hashval = xfs_dir2_hashname(dp->i_mount, name), .whichfork = XFS_DATA_FORK, .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, }; - bool isblock, isleaf; int error; if (xfs_is_shutdown(dp->i_mount)) return -EIO; + /* + * A temporary directory's block headers are written with the owner + * set to sc->ip, so we must switch the owner here for the lookup. + */ + if (dp == sc->tempip) + args.owner = sc->ip->i_ino; + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_lookup(&args); - goto out_check_rval; - } + error = xfs_dir_lookup_args(&args); + if (!error) + *ino = args.inumber; + return error; +} - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; +/* + * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock + * state. The caller may have a transaction, so we must use trylock for both + * IOLOCKs. 
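+ * Blocking on an IOLOCK while holding a transaction would invert the usual
+ * "IOLOCK, then transaction, then ILOCK" ordering, so both IOLOCK
+ * acquisitions below have to be trylocks.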
+ */ +static inline unsigned int +xchk_dir_trylock_both( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + goto parent_iolock; - if (isblock) { - error = xfs_dir2_block_lookup(&args); - goto out_check_rval; - } + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto parent_ilock; - error = xfs_dir2_isleaf(&args, &isleaf); - if (error) - return error; + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; + +parent_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); +parent_iolock: + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target + * (@sc->ip) and the inode at the other end (@ip) of a directory or parent + * pointer link so that we can check that link. + * + * We do not know ahead of time that the directory tree is /not/ corrupt, so we + * cannot use the "lock two inode" functions because we do not know that there + * is not a racing thread trying to take the locks in opposite order. First + * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED + * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub + * target and @ip to synchronize with XFS. + * + * If the trylocks succeed, *lockmode will be set to the locks held for @ip; + * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will + * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if + * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed. + */ +int +xchk_dir_trylock_for_pptrs( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int *lockmode) +{ + unsigned int nr; + int error = 0; + + ASSERT(sc->ilock_flags == 0); + + for (nr = 0; nr < HZ; nr++) { + *lockmode = xchk_dir_trylock_both(sc, ip); + if (*lockmode) + return 0; - if (isleaf) { - error = xfs_dir2_leaf_lookup(&args); - goto out_check_rval; + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); } - error = xfs_dir2_node_lookup(&args); + if (sc->flags & XCHK_TRY_HARDER) { + xchk_set_incomplete(sc); + return -ETIMEDOUT; + } -out_check_rval: - if (error == -EEXIST) - error = 0; - if (!error) - *ino = args.inumber; - return error; + return -EDEADLOCK; } diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h index 55787f4df123..da501877a64d 100644 --- a/fs/xfs/scrub/readdir.h +++ b/fs/xfs/scrub/readdir.h @@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *ino); +int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int *lockmode); + #endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 0252a3b5b65a..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -33,6 +33,9 @@ #include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_defer.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -40,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" #include "scrub/fsb_bitmap.h" +#include "scrub/rtb_bitmap.h" #include "scrub/reap.h" /* @@ -137,7 +141,7 @@ xreap_put_freelist( agfl_bp, agbno, 0); if 
(error) return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; @@ -211,6 +215,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs) rs->force_roll = false; } +/* + * Compute the maximum length of a buffer cache scan (in units of sectors), + * given a quantity of fs blocks. + */ +xfs_daddr_t +xrep_bufscan_max_sectors( + struct xfs_mount *mp, + xfs_extlen_t fsblocks) +{ + int max_fsbs; + + /* Remote xattr values are the largest buffers that we support. */ + max_fsbs = xfs_attr3_max_rmt_blocks(mp); + + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); +} + +/* + * Return an incore buffer from a sector scan, or NULL if there are no buffers + * left to return. + */ +struct xfs_buf * +xrep_bufscan_advance( + struct xfs_mount *mp, + struct xrep_bufscan *scan) +{ + scan->__sector_count += scan->daddr_step; + while (scan->__sector_count <= scan->max_sectors) { + struct xfs_buf *bp = NULL; + int error; + + error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr, + scan->__sector_count, XBF_LIVESCAN, &bp); + if (!error) + return bp; + + scan->__sector_count += scan->daddr_step; + } + + return NULL; +} + /* Try to invalidate the incore buffers for an extent that we're freeing. */ STATIC void xreap_agextent_binval( @@ -221,7 +267,6 @@ xreap_agextent_binval( struct xfs_scrub *sc = rs->sc; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; xfs_agblock_t agbno_next = agbno + *aglenp; xfs_agblock_t bno = agbno; @@ -241,28 +286,15 @@ xreap_agextent_binval( * of any plausible size. */ while (bno < agbno_next) { - xfs_agblock_t fsbcount; - xfs_agblock_t max_fsbs; - - /* - * Max buffer size is the max remote xattr buffer size, which - * is one fs block larger than 64k. - */ - max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, - xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - - for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { - struct xfs_buf *bp = NULL; - xfs_daddr_t daddr; - int error; - - daddr = XFS_AGB_TO_DADDR(mp, agno, bno); - error = xfs_buf_incore(mp->m_ddev_targp, daddr, - XFS_FSB_TO_BB(mp, fsbcount), - XBF_LIVESCAN, &bp); - if (error) - continue; - + struct xrep_bufscan scan = { + .daddr = xfs_agbno_to_daddr(pag, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + agbno_next - bno), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); rs->invalidated++; @@ -282,7 +314,7 @@ xreap_agextent_binval( } out: - trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); + trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp); } /* @@ -341,7 +373,8 @@ xreap_agextent_select( out_found: *aglenp = len; - trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); + trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); out_cur: xfs_btree_del_cursor(cur, error); return error; @@ -362,7 +395,9 @@ xreap_agextent_iter( xfs_fsblock_t fsbno; int error = 0; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + ASSERT(rs->resv != XFS_AG_RESV_METAFILE); + + fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno); /* * If there are other rmappings, this block is cross linked and must @@ -378,7 +413,8 @@ xreap_agextent_iter( * to run xfs_repair. 
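 *
 * (Put differently: a crosslinked extent only loses this owner's mapping
 * records below; the blocks themselves stay allocated for the other owners.)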
*/ if (crosslinked) { - trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, + *aglenp); rs->force_roll = true; @@ -388,7 +424,8 @@ xreap_agextent_iter( * records from the refcountbt, which will remove the * rmap record as well. */ - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, + *aglenp); return 0; } @@ -396,7 +433,7 @@ xreap_agextent_iter( *aglenp, rs->oinfo); } - trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); /* * Invalidate as many buffers as we can, starting at agbno. If this @@ -420,9 +457,9 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { ASSERT(rs->resv == XFS_AG_RESV_NONE); - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, - rs->resv, true); + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -448,7 +485,7 @@ xreap_agextent_iter( * system with large EFIs. */ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, - rs->resv, true); + rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -646,3 +683,637 @@ xrep_reap_fsblocks( return 0; } + +#ifdef CONFIG_XFS_RT +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. Units are rt blocks, not rt extents. + */ +STATIC int +xreap_rgextent_select( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_rgblock_t rgbno_next, + bool *crosslinked, + xfs_extlen_t *rglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_rgblock_t bno = rgbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); + error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < rgbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + + *rglenp = len; + trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len, + *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this rtgroup extent as possible. + * The number of blocks disposed of will be returned in @rglenp. + */ +STATIC int +xreap_rgextent_iter( + struct xreap_state *rs, + xfs_rgblock_t rgbno, + xfs_extlen_t *rglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_rtblock_t rtbno; + int error; + + /* + * The only caller so far is CoW fork repair, so we only know how to + * unlink or free CoW staging extents. Here we don't have to worry + * about invalidating buffers! 
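+ * (CoW staging extents hold only file data, and file data is tracked by
+ * the page cache rather than the buffer cache -- so there is nothing in
+ * the buffer cache to invalidate.)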
+ */ + if (rs->oinfo != &XFS_RMAP_OINFO_COW) { + ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW); + return -EFSCORRUPTED; + } + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the forward and reverse mapping and move on. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno, + *rglenp); + + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + rs->deferred++; + return 0; + } + + trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); + + /* + * The CoW staging extent is not crosslinked. Use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); + error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL, + rs->resv, + XFS_FREE_EXTENT_REALTIME | + XFS_FREE_EXTENT_SKIP_DISCARD); + if (error) + return error; + + rs->deferred++; + return 0; +} + +#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +/* + * Break a rt file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * be aligned to a realtime extent. + */ +STATIC int +xreap_rtmeta_extent( + uint64_t rtbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno); + xfs_rgblock_t rgbno_next = rgbno + len; + int error = 0; + + ASSERT(sc->ip != NULL); + ASSERT(!sc->sr.rtg); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno)); + if (!sc->sr.rtg) + return -EFSCORRUPTED; + + xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + + while (rgbno < rgbno_next) { + xfs_extlen_t rglen; + bool crosslinked; + + error = xreap_rgextent_select(rs, rgbno, rgbno_next, + &crosslinked, &rglen); + if (error) + goto out_unlock; + + error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked); + if (error) + goto out_unlock; + + if (xreap_want_defer_finish(rs)) { + error = xfs_defer_finish(&sc->tp); + if (error) + goto out_unlock; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + if (error) + goto out_unlock; + xreap_reset(rs); + } + + rgbno += rglen; + } + +out_unlock: + xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL); + xfs_rtgroup_put(sc->sr.rtg); + sc->sr.rtg = NULL; + return error; +} + +/* + * Dispose of every block of every rt metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. 
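+ *
+ * A rough calling sketch (hypothetical caller; the bitmap helpers are the
+ * ones from scrub/rtb_bitmap.h):
+ *
+ *	struct xrtb_bitmap	old_blocks;
+ *
+ *	xrtb_bitmap_init(&old_blocks);
+ *	error = xrtb_bitmap_set(&old_blocks, rtbno, len);
+ *	...
+ *	error = xrep_reap_rtblocks(sc, &old_blocks, oinfo);
+ *	xrtb_bitmap_destroy(&old_blocks);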
+ */ +int +xrep_reap_rtblocks( + struct xfs_scrub *sc, + struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} +#endif /* CONFIG_XFS_RT */ + +/* + * Dispose of every block of an old metadata btree that used to be rooted in a + * metadata directory file. + */ +int +xrep_reap_metadir_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap) +{ + /* + * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old + * blocks are no longer mapped by the inode, and inode metadata space + * reservations can only account freed space to the i_nblocks. + */ + struct xfs_owner_info oinfo; + struct xreap_state rs = { + .sc = sc, + .oinfo = &oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + ASSERT(xfs_is_metadir_inode(sc->ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return xrep_reset_metafile_resv(sc); +} + +/* + * Metadata files are not supposed to share blocks with anything else. + * If blocks are shared, we remove the reverse mapping (thus reducing the + * crosslink factor); if blocks are not shared, we also need to free them. + * + * This first step determines the longest subset of the passed-in imap + * (starting at its beginning) that is either crosslinked or not crosslinked. + * The blockcount will be adjusted down as needed. + */ +STATIC int +xreap_bmapi_select( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool *crosslinked) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + xfs_filblks_t len = 1; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); + if (error) + goto out_cur; + + bno = agbno + 1; + while (bno < agbno_next) { + bool also_crosslinked; + + oinfo.oi_offset++; + error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (also_crosslinked != *crosslinked) + break; + + len++; + bno++; + } + + imap->br_blockcount = len; + trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len, + *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Decide if this buffer can be joined to a transaction. This is true for most + * buffers, but there are two cases that we want to catch: large remote xattr + * value buffers are not logged and can overflow the buffer log item dirty + * bitmap size; and oversized cached buffers if things have really gone + * haywire.
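+ *
+ * Worked example, assuming the usual 128-byte XFS_BLF_CHUNK and 32-bit
+ * bitmap words: a 64k buffer needs 65536/128 = 512 chunks = 16 map words,
+ * which just fits in XFS_BLF_DATAMAP_SIZE; a remote xattr buffer one
+ * fsblock larger does not, so it must be staled by hand instead of logged.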
+ */ +static inline bool +xreap_buf_loggable( + const struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < bp->b_map_count; i++) { + int chunks; + int map_size; + + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) + return false; + } + + return true; +} + +/* + * Invalidate any buffers for this file mapping. The @imap blockcount may be + * adjusted downward if we need to roll the transaction. + */ +STATIC int +xreap_bmapi_binval( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + int bmap_flags = xfs_bmapi_aflag(whichfork); + xfs_fileoff_t off; + xfs_fileoff_t max_off; + xfs_extlen_t scan_blocks; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + unsigned int invalidated = 0; + int error; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return 0; + + /* + * Buffers for file blocks can span multiple contiguous mappings. This + * means that for each block in the mapping, there could exist an + * xfs_buf indexed by that block with any length up to the maximum + * buffer size (remote xattr values) or to the next hole in the fork. + * To set up our binval scan, first we need to figure out the location + * of the next hole. + */ + off = imap->br_startoff + imap->br_blockcount; + max_off = off + xfs_attr3_max_rmt_blocks(mp); + while (off < max_off) { + struct xfs_bmbt_irec hmap; + int nhmaps = 1; + + error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + &nhmaps, bmap_flags); + if (error) + return error; + if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + if (!xfs_bmap_is_real_extent(&hmap)) + break; + + off = hmap.br_startoff + hmap.br_blockcount; + } + scan_blocks = off - imap->br_startoff; + + trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks); + + /* + * If there are incore buffers for these blocks, invalidate them. If + * we can't (try)lock the buffer we assume it's owned by someone else + * and leave it alone. The buffer cache cannot detect aliasing, so + * employ nested loops to detect incore buffers of any plausible size. + */ + while (bno < agbno_next) { + struct xrep_bufscan scan = { + .daddr = xfs_agbno_to_daddr(pag, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + scan_blocks), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + if (xreap_buf_loggable(bp)) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * much of the mapping we've seen so far. + */ + if (invalidated > XREAP_MAX_BINVAL) { + imap->br_blockcount = agbno_next - bno; + goto out; + } + } + + bno++; + scan_blocks--; + } + +out: + trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno, + imap->br_blockcount); + return 0; +} + +/* + * Dispose of as much of the beginning of this file fork mapping as possible. + * The number of blocks disposed of is returned in @imap->br_blockcount. 
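+ *
+ * In short: a crosslinked mapping only loses this file's bmap and rmap
+ * records, while a sole-owner mapping is also invalidated in the buffer
+ * cache and queued for freeing via xfs_free_extent_later().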
+ */ +STATIC int +xrep_reap_bmapi_iter( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool crosslinked) +{ + int error; + + if (crosslinked) { + /* + * If there are other rmappings, this block is cross linked and + * must not be freed. Remove the reverse mapping, leave the + * buffer cache in its possibly confused state, and move on. + * We don't want to risk discarding valid data buffers from + * anybody else who thinks they own the block, even though that + * runs the risk of stale buffer warnings in the future. + */ + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Schedule removal of the mapping from the fork. We use + * deferred log intents in this function to control the exact + * sequence of metadata updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + return 0; + } + + /* + * If the block is not crosslinked, we can invalidate all the incore + * buffers for the extent, and then free the extent. This is a bit of + * a mess since we don't detect discontiguous buffers that are indexed + * by a block starting before the first block of the extent but overlap + * anyway. + */ + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Invalidate as many buffers as we can, starting at the beginning of + * this mapping. If this function sets blockcount to zero, the + * transaction is full of logged buffer invalidations, so we need to + * return early so that we can roll and retry. + */ + error = xreap_bmapi_binval(sc, ip, whichfork, imap); + if (error || imap->br_blockcount == 0) + return error; + + /* + * Schedule removal of the mapping from the fork. We use deferred log + * intents in this function to control the exact sequence of metadata + * updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + return xfs_free_extent_later(sc->tp, imap->br_startblock, + imap->br_blockcount, NULL, XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_SKIP_DISCARD); +} + +/* + * Dispose of as much of this file extent as we can. Upon successful return, + * the imap will reflect the mapping that was removed from the fork. + */ +STATIC int +xreap_ifork_extent( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + xfs_agnumber_t agno; + bool crosslinked; + int error; + + ASSERT(sc->sa.pag == NULL); + + trace_xreap_ifork_extent(sc, ip, whichfork, imap); + + agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + /* + * Decide the fate of the blocks at the beginning of the mapping, then + * update the mapping to use it with the unmap calls. 
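+	 * Note that both calls below may shorten imap->br_blockcount; the
+	 * caller advances its scan offset by whatever remains.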
+ */ + error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + if (error) + goto out_agf; + + error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + if (error) + goto out_agf; + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of each block mapped to the given fork of the given file. Callers + * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork + * must not have any delalloc reservations. + */ +int +xrep_reap_ifork( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork) +{ + xfs_fileoff_t off = 0; + int bmap_flags = xfs_bmapi_aflag(whichfork); + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(ip == sc->ip || ip == sc->tempip); + ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + + while (off < XFS_MAX_FILEOFF) { + struct xfs_bmbt_irec imap; + int nimaps = 1; + + /* Read the next extent, skip past holes and delalloc. */ + error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap, + &nimaps, bmap_flags); + if (error) + return error; + if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + /* + * If this is a real space mapping, reap as much of it as we + * can in a single transaction. + */ + if (xfs_bmap_is_real_extent(&imap)) { + error = xreap_ifork_extent(sc, ip, whichfork, &imap); + if (error) + return error; + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + off = imap.br_startoff + imap.br_blockcount; + } + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 0b69f16dd98f..4c8f62701fb3 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -13,5 +13,35 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, const struct xfs_owner_info *oinfo); +int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); +int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap); + +#ifdef CONFIG_XFS_RT +int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); +#else +# define xrep_reap_rtblocks(...) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + +/* Buffer cache scan context. */ +struct xrep_bufscan { + /* Disk address for the buffers we want to scan. */ + xfs_daddr_t daddr; + + /* Maximum number of sectors to scan. */ + xfs_daddr_t max_sectors; + + /* Each round, increment the search length by this number of sectors. */ + xfs_daddr_t daddr_step; + + /* Internal scan state; initialize to zero. 
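+	 *
+	 * A typical scan, mirroring the callers in reap.c:
+	 *
+	 *	struct xrep_bufscan scan = {
+	 *		.daddr		= xfs_agbno_to_daddr(pag, agbno),
+	 *		.max_sectors	= xrep_bufscan_max_sectors(mp, len),
+	 *		.daddr_step	= XFS_FSB_TO_BB(mp, 1),
+	 *	};
+	 *	struct xfs_buf	*bp;
+	 *
+	 *	while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
+	 *		... invalidate or release bp ...
+	 *	}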
*/ + xfs_daddr_t __sector_count; +}; + +xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp, + xfs_extlen_t fsblocks); +struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp, + struct xrep_bufscan *scan); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index d0c7d4a29c0f..d46528023015 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -421,7 +421,7 @@ xchk_refcount_mergeable( if (r1->rc_refcount != r2->rc_refcount) return false; if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > - MAXREFCEXTLEN) + XFS_REFC_LEN_MAX) return false; return true; @@ -453,7 +453,8 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_refcount_check_irec(to_perag(bs->cur->bc_group), &irec) != + NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -490,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index a00d7ce7ae5b..9c8cb5332da0 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -183,13 +183,13 @@ xrep_refc_stash( if (xchk_should_terminate(sc, &error)) return error; - irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); error = xrep_refc_check_ext(rr->sc, &irec); if (error) return error; - trace_xrep_refc_found(sc->sa.pag, &irec); + trace_xrep_refc_found(pag_group(sc->sa.pag), &irec); return xfarray_append(rr->refcount_records, &irec); } @@ -215,7 +215,7 @@ xrep_refc_rmap_shareable( return false; /* Metadata in files are never shareable */ - if (xfs_internal_inum(mp, rmap->rm_owner)) + if (xfs_is_sb_inum(mp, rmap->rm_owner)) return false; /* Metadata and unwritten file blocks are not shareable. */ @@ -422,7 +422,7 @@ xrep_refc_find_refcounts( /* * Set up a bag to store all the rmap records that we're tracking to * generate a reference count record. If the size of the bag exceeds - * MAXREFCOUNT, we clamp rc_refcount. + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. */ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) @@ -590,7 +590,6 @@ xrep_refc_build_new_tree( struct xfs_scrub *sc = rr->sc; struct xfs_btree_cur *refc_cur; struct xfs_perag *pag = sc->sa.pag; - xfs_fsblock_t fsbno; int error; error = xrep_refc_sort_records(rr); @@ -603,8 +602,8 @@ xrep_refc_build_new_tree( * to root the new btree while it's under construction and before we * attach it to the AG header. 
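 * (The reservation hint below points at xfs_refc_block(), i.e. the block
 * that a freshly formatted AG would use for the refcount btree root.)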
*/ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); - xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, + xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)), XFS_AG_RESV_METADATA); rr->new_btree.bload.get_records = xrep_refc_get_records; rr->new_btree.bload.claim_block = xrep_refc_claim_block; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f43dce771cdd..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_rtbitmap.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" @@ -32,6 +33,17 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_buf_mem.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -39,6 +51,7 @@ #include "scrub/bitmap.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/attr_repair.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -56,6 +69,7 @@ xrep_attempt( trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); xchk_ag_btcur_free(&sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); /* Repair whatever's broken. */ ASSERT(sc->ops->repair); @@ -290,7 +304,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(pag, NULL, &bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -300,7 +314,7 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } else { @@ -320,16 +334,14 @@ xrep_calc_ag_resblks( /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || - aglen != pag->block_count || + aglen != pag_group(pag)->xg_block_count || freelen >= aglen) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } - xfs_perag_put(pag); - trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, - freelen, usedlen); + trace_xrep_calc_ag_resblks(pag, icount, aglen, freelen, usedlen); /* * Figure out how many blocks we'd need worst case to rebuild @@ -367,12 +379,48 @@ xrep_calc_ag_resblks( rmapbt_sz = 0; } - trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, - inobt_sz, rmapbt_sz, refcbt_sz); + trace_xrep_calc_ag_resblks_btsize(pag, bnobt_sz, inobt_sz, rmapbt_sz, + refcbt_sz); + xfs_perag_put(pag); return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } +#ifdef CONFIG_XFS_RT +/* + * Figure out how many blocks to reserve for a rtgroup repair. We calculate + * the worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-rtgroup btree. 
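+ *
+ * Right now the rt rmap btree is the only per-rtgroup btree that we size
+ * here, so the estimate below is xfs_rtrmapbt_calc_size() applied to the
+ * block count of the group.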
+ */ +xfs_extlen_t +xrep_calc_rtgroup_resblks( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + uint64_t usedlen; + xfs_extlen_t rmapbt_sz = 0; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + if (!xfs_has_rtgroups(mp)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno)); + ASSERT(usedlen <= XFS_MAX_RGBLOCKS); + + if (xfs_has_rmapbt(mp)) + rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen); + + trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen, + rmapbt_sz); + + return rmapbt_sz; +} +#endif /* CONFIG_XFS_RT */ + /* * Reconstructing per-AG Btrees * @@ -409,7 +457,7 @@ xrep_fix_freelist( args.mp = sc->mp; args.tp = sc->tp; - args.agno = sc->sa.pag->pag_agno; + args.agno = pag_agno(sc->sa.pag); args.alignment = 1; args.pag = sc->sa.pag; @@ -478,7 +526,7 @@ xrep_findroot_block( int block_level; int error = 0; - daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); + daddr = xfs_agbno_to_daddr(ri->sc->sa.pag, agbno); /* * Blocks in the AGFL have stale contents that might just happen to @@ -607,7 +655,7 @@ xrep_findroot_block( else fab->root = NULLAGBLOCK; - trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, + trace_xrep_findroot_block(ri->sc->sa.pag, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); out: xfs_trans_brelse(ri->sc->tp, bp); @@ -724,7 +772,7 @@ xrep_update_qflags( xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); no_update: - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); } /* Force a quotacheck the next time we mount. */ @@ -908,7 +956,7 @@ xrep_reinit_pagi( ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp); if (error) return error; @@ -934,7 +982,7 @@ xrep_ag_init( ASSERT(!sa->pag); - error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp); if (error) return error; @@ -948,6 +996,83 @@ xrep_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* Initialize all the btree cursors for a RT repair. */ +void +xrep_rtgroup_btcur_init( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(sr->rtg != NULL); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT && + (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && + xfs_has_rtrmapbt(mp)) + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); + + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && + (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) && + xfs_has_rtreflink(mp)) + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); +} + +/* + * Given a reference to a rtgroup structure, lock rtgroup btree inodes and + * create btree cursors. Must only be called to repair a regular rt file. + */ +int +xrep_rtgroup_init( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg, + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + ASSERT(sr->rtg == NULL); + + xfs_rtgroup_lock(rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + + /* Grab our own passive reference from the caller's ref. */ + sr->rtg = xfs_rtgroup_hold(rtg); + xrep_rtgroup_btcur_init(sc, sr); + return 0; +} + +/* Ensure that all rt blocks in the given range are not marked free. 
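+ * On zoned devices there is no rt bitmap to consult; the most we can do is
+ * check that the range does not extend past the valid portion of the group.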
*/ +int +xrep_require_rtext_inuse( + struct xfs_scrub *sc, + xfs_rgblock_t rgbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = sc->mp; + xfs_rtxnum_t startrtx; + xfs_rtxnum_t endrtx; + bool is_free = false; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } + + startrtx = xfs_rgbno_to_rtx(mp, rgbno); + endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); + + error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx, + endrtx - startrtx + 1, &is_free); + if (error) + return error; + if (is_free) + return -EFSCORRUPTED; + + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* Reinitialize the per-AG block reservation for the AG we just fixed. */ int xrep_reset_perag_resv( @@ -963,18 +1088,15 @@ xrep_reset_perag_resv( ASSERT(sc->tp); sc->flags &= ~XREP_RESET_PERAG_RESV; - error = xfs_ag_resv_free(sc->sa.pag); - if (error) - goto out; + xfs_ag_resv_free(sc->sa.pag); error = xfs_ag_resv_init(sc->sa.pag, sc->tp); if (error == -ENOSPC) { xfs_err(sc->mp, "Insufficient free space to reset per-AG reservation for AG %u after repair.", - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); error = 0; } -out: return error; } @@ -1004,55 +1126,27 @@ xrep_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - __u32 smflags = sc->sm->sm_flags; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; /* - * Let's see if the inode needs repair. We're going to open-code calls - * to the scrub and repair functions so that we can hang on to the + * Let's see if the inode needs repair. Use a subordinate scrub context + * to call the scrub and repair functions so that we can hang on to the * resources that we already acquired instead of using the standard * setup/teardown routines. */ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; - - if (!xrep_will_attempt(sc)) + if (!xrep_will_attempt(&sub->sc)) goto out; /* * Repair some part of the inode. This will potentially join the inode * to the transaction. */ - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xrep_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xrep_bmap(sc, XFS_DATA_FORK, false); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xrep_bmap(sc, XFS_ATTR_FORK, false); - break; - } + error = sub->sc.ops->repair(&sub->sc); if (error) goto out; @@ -1061,10 +1155,10 @@ xrep_metadata_inode_subtype( * that the inode will not be joined to the transaction when we exit * the function. */ - error = xfs_defer_finish(&sc->tp); + error = xfs_defer_finish(&sub->sc.tp); if (error) goto out; - error = xfs_trans_roll(&sc->tp); + error = xfs_trans_roll(&sub->sc.tp); if (error) goto out; @@ -1072,31 +1166,18 @@ xrep_metadata_inode_subtype( * Clear the corruption flags and re-check the metadata that we just * repaired. 
*/ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - } + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; /* If corruption persists, the repair has failed. */ - if (xchk_needs_repair(sc->sm)) { + if (xchk_needs_repair(sub->sc.sm)) { error = -EFSCORRUPTED; goto out; } out: - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; - sc->sm->sm_flags = smflags; + xchk_scrub_free_subord(sub); return error; } @@ -1122,10 +1203,17 @@ xrep_metadata_inode_forks( if (error) return error; - /* Make sure the attr fork looks ok before we delete it. */ - error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); - if (error) - return error; + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. + * For a non-metadir filesystem, make sure the attr fork looks ok + * before we delete it. + */ + if (xfs_inode_hasattr(sc->ip)) { + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + } /* Clear the reflink flag since metadata never shares. */ if (xfs_is_reflink_inode(sc->ip)) { @@ -1137,6 +1225,20 @@ xrep_metadata_inode_forks( } /* + * Metadata files on non-metadir filesystems cannot have attr forks, + * so clear them now. + */ + if (xfs_inode_hasattr(sc->ip) && !xfs_has_metadir(sc->mp)) { + if (!dirty) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + } + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + } + + /* * If we modified the inode, roll the transaction but don't rejoin the * inode to the new transaction because xrep_bmap_data can do that. */ @@ -1201,3 +1303,141 @@ xrep_trans_cancel_hook_dummy( current->journal_info = *cookiep; *cookiep = NULL; } + +/* + * See if this buffer can pass the given ->verify_struct() function. + * + * If the buffer already has ops attached and they're not the ones that were + * passed in, we reject the buffer. Otherwise, we perform the structure test + * (note that we do not check CRCs) and return the outcome of the test. The + * buffer ops and error state are left unchanged. + */ +bool +xrep_buf_verify_struct( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + const struct xfs_buf_ops *old_ops = bp->b_ops; + xfs_failaddr_t fa; + int old_error; + + if (old_ops) { + if (old_ops != ops) + return false; + } + + old_error = bp->b_error; + bp->b_ops = ops; + fa = bp->b_ops->verify_struct(bp); + bp->b_ops = old_ops; + bp->b_error = old_error; + + return fa == NULL; +} + +/* Check the sanity of a rmap record for a metadata btree inode. */ +int +xrep_check_ino_btree_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + /* + * Metadata btree inodes never have extended attributes, and all blocks + * should have the bmbt block flag set. + */ + if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) || + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + return -EFSCORRUPTED; + + /* Make sure the block is within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. 
*/ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* + * Reset the block count of the inode being repaired, and adjust the dquot + * block usage to match. The inode must not have an xattr fork. + */ +void +xrep_inode_set_nblocks( + struct xfs_scrub *sc, + int64_t new_blocks) +{ + int64_t delta = + new_blocks - sc->ip->i_nblocks; + + sc->ip->i_nblocks = new_blocks; + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + if (delta != 0) + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, + delta); +} + +/* Reset the block reservation for a metadata inode. */ +int +xrep_reset_metafile_resv( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int64_t delta; + int error; + + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; + if (delta == 0) + return 0; + + /* + * Too many blocks have been reserved, transfer some from the incore + * reservation back to the filesystem. + */ + if (delta > 0) { + int64_t give_back; + + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); + if (give_back > 0) { + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; + } + + return 0; + } + + /* + * Not enough reservation; try to take some blocks from the filesystem + * to the metabtree reservation. + */ + delta = -delta; /* delta is negative here, so invert the sign. */ + error = xfs_dec_fdblocks(mp, delta, true); + while (error == -ENOSPC) { + delta--; + if (delta == 0) { + xfs_warn(sc->mp, +"Insufficient free space to reset metabtree reservation after repair."); + return 0; + } + error = xfs_dec_fdblocks(mp, delta, true); + } + if (error) + return error; + + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index ce082d941459..af0a3a9e5ed9 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,7 @@ #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xchk_stats_run; static inline int xrep_notsupported(struct xfs_scrub *sc) @@ -49,7 +50,9 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xrgb_bitmap; struct xfsb_bitmap; +struct xrtb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); @@ -90,6 +93,14 @@ int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); +int xrep_setup_xattr(struct xfs_scrub *sc); +int xrep_setup_directory(struct xfs_scrub *sc); +int xrep_setup_parent(struct xfs_scrub *sc); +int xrep_setup_nlinks(struct xfs_scrub *sc); +int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); +int xrep_setup_dirtree(struct xfs_scrub *sc); +int xrep_setup_rtrmapbt(struct xfs_scrub *sc); +int xrep_setup_rtrefcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -100,6 +111,20 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, struct xchk_ag *sa); +#ifdef CONFIG_XFS_RT +int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, + struct xchk_rt *sr, unsigned int 
rtglock_flags); +void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr); +int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_filblks_t len); +xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc); +#else +# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) +# define xrep_calc_rtgroup_resblks(sc) (0) +#endif /* CONFIG_XFS_RT */ + +int xrep_check_ino_btree_mapping(struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec); /* Metadata revalidators */ @@ -123,11 +148,25 @@ int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_nlinks(struct xfs_scrub *sc); int xrep_fscounters(struct xfs_scrub *sc); +int xrep_xattr(struct xfs_scrub *sc); +int xrep_directory(struct xfs_scrub *sc); +int xrep_parent(struct xfs_scrub *sc); +int xrep_symlink(struct xfs_scrub *sc); +int xrep_dirtree(struct xfs_scrub *sc); +int xrep_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); +int xrep_rtsummary(struct xfs_scrub *sc); +int xrep_rgsuperblock(struct xfs_scrub *sc); +int xrep_rtrmapbt(struct xfs_scrub *sc); +int xrep_rtrefcountbt(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported +# define xrep_rtsummary xrep_notsupported +# define xrep_rgsuperblock xrep_notsupported +# define xrep_rtrmapbt xrep_notsupported +# define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -145,10 +184,23 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, struct xfs_trans **tpp); void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); +bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); +int xrep_reset_metafile_resv(struct xfs_scrub *sc); + #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. 
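+ * That way a repair request on a CONFIG_XFS_ONLINE_REPAIR=n kernel reports
+ * "operation not supported" instead of silently doing nothing.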
+ */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( @@ -167,6 +219,8 @@ xrep_calc_ag_resblks( return 0; } +#define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks + static inline int xrep_reset_perag_resv( @@ -188,9 +242,22 @@ xrep_setup_nothing( #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_setup_ag_rmapbt xrep_setup_nothing #define xrep_setup_ag_refcountbt xrep_setup_nothing +#define xrep_setup_xattr xrep_setup_nothing +#define xrep_setup_directory xrep_setup_nothing +#define xrep_setup_parent xrep_setup_nothing +#define xrep_setup_nlinks xrep_setup_nothing +#define xrep_setup_dirtree xrep_setup_nothing +#define xrep_setup_metapath xrep_setup_nothing +#define xrep_setup_rtrmapbt xrep_setup_nothing +#define xrep_setup_rtrefcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) +static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) +{ + return 0; +} + #define xrep_revalidate_allocbt (NULL) #define xrep_revalidate_iallocbt (NULL) @@ -212,6 +279,16 @@ xrep_setup_nothing( #define xrep_quotacheck xrep_notsupported #define xrep_nlinks xrep_notsupported #define xrep_fscounters xrep_notsupported +#define xrep_rtsummary xrep_notsupported +#define xrep_xattr xrep_notsupported +#define xrep_directory xrep_notsupported +#define xrep_parent xrep_notsupported +#define xrep_symlink xrep_notsupported +#define xrep_dirtree xrep_notsupported +#define xrep_metapath xrep_notsupported +#define xrep_rgsuperblock xrep_notsupported +#define xrep_rtrmapbt xrep_notsupported +#define xrep_rtrefcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rgb_bitmap.h b/fs/xfs/scrub/rgb_bitmap.h new file mode 100644 index 000000000000..4c3126b66dcb --- /dev/null +++ b/fs/xfs/scrub/rgb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RGB_BITMAP_H__ +#define __XFS_SCRUB_RGB_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_rgblock_t */ + +struct xrgb_bitmap { + struct xbitmap32 rgbitmap; +}; + +static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->rgbitmap); +} + +static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->rgbitmap); +} + +static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap, + xfs_rgblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->rgbitmap, start, len); +} + +static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->rgbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c new file mode 100644 index 000000000000..d189732d0e24 --- /dev/null +++ b/fs/xfs/scrub/rgsuper.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_rtgroup.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_rmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" + +/* Set us up with a transaction and an empty context. */ +int +xchk_setup_rgsuperblock( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, 0); +} + +/* Cross-reference with the other rt metadata. */ +STATIC void +xchk_rgsuperblock_xref( + struct xfs_scrub *sc) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1); + xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS); +} + +int +xchk_rgsuperblock( + struct xfs_scrub *sc) +{ + xfs_rgnumber_t rgno = sc->sm->sm_agno; + int error; + + /* + * Only rtgroup 0 has a superblock. We may someday want to use higher + * rgno for other functions, similar to what we do with the primary + * super scrub function. + */ + if (rgno != 0) + return -ENOENT; + + /* + * Grab an active reference to the rtgroup structure. If we can't get + * it, we're racing with something that's tearing down the group, so + * signal that the group no longer exists. Take the rtbitmap in shared + * mode so that the group can't change while we're doing things. + */ + error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + if (error) + return error; + + /* + * Since we already validated the rt superblock at mount time, we don't + * need to check its contents again. All we need is to cross-reference. 
+ */ + xchk_rgsuperblock_xref(sc); + return 0; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int +xrep_rgsuperblock( + struct xfs_scrub *sc) +{ + ASSERT(rtg_rgno(sc->sr.rtg) == 0); + + xfs_log_sb(sc->tp); + return 0; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index ba5bbc3fb754..39e9ad7cd8ae 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -358,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + xfs_rmap_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -410,7 +410,7 @@ xchk_rmapbt_walk_ag_metadata( goto out; /* OWN_LOG: Internal log */ - if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + if (xfs_ag_contains_log(mp, pag_agno(sc->sa.pag))) { error = xagb_bitmap_set(&cr->log_owned, XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), mp->m_sb.sb_logblocks); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index e8e07b683eab..f5f73078ffe2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -31,6 +31,9 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrefcount_btree.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -231,7 +234,7 @@ xrep_rmap_stash( if (xchk_iscan_aborted(&rr->iscan)) return -EFSCORRUPTED; - trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + trace_xrep_rmap_found(sc->sa.pag, &rmap); mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); @@ -344,7 +347,7 @@ xrep_rmap_visit_bmbt( int error; if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != - rf->rr->sc->sa.pag->pag_agno) + pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); @@ -391,7 +394,7 @@ xrep_rmap_visit_iroot_btree_block( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -432,14 +435,6 @@ out: return error; } -static inline bool -is_rt_data_fork( - struct xfs_inode *ip, - int whichfork) -{ - return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; -} - /* * Iterate the block mapping btree to collect rmap records for anything in this * fork that matches the AG. Sets @mappings_done to true if we've scanned the @@ -507,6 +502,69 @@ xrep_rmap_scan_iext( return xrep_rmap_stash_accumulated(rf); } +static int +xrep_rmap_scan_meta_btree( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = rf->rr->sc; + struct xfs_rtgroup *rtg = NULL; + struct xfs_btree_cur *cur = NULL; + enum xfs_rtg_inodes type; + int error; + + if (rf->whichfork != XFS_DATA_FORK) + return -EFSCORRUPTED; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + type = XFS_RTGI_RMAP; + break; + case XFS_METAFILE_RTREFCOUNT: + type = XFS_RTGI_REFCOUNT; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { + if (ip == rtg->rtg_inodes[type]) + goto found; + } + + /* + * We should never find an rt metadata btree inode that isn't + * associated with an rtgroup yet has ondisk blocks allocated to it. 
+ */ + if (ip->i_nblocks) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; + +found: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); + break; + case XFS_METAFILE_RTREFCOUNT: + cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out_rtg; + } + + error = xrep_rmap_scan_iroot_btree(rf, cur); + xfs_btree_del_cursor(cur, error); +out_rtg: + xfs_rtgroup_rele(rtg); + return error; +} + /* Find all the extents from a given AG in an inode fork. */ STATIC int xrep_rmap_scan_ifork( @@ -520,14 +578,14 @@ xrep_rmap_scan_ifork( .whichfork = whichfork, }; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + bool mappings_done; int error = 0; if (!ifp) return 0; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - bool mappings_done; - + switch (ifp->if_format) { + case XFS_DINODE_FMT_BTREE: /* * Scan the bmap btree for data device mappings. This includes * the btree blocks themselves, even if this is a realtime @@ -536,15 +594,18 @@ xrep_rmap_scan_ifork( error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); if (error || mappings_done) return error; - } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { - return 0; + fallthrough; + case XFS_DINODE_FMT_EXTENTS: + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); + case XFS_DINODE_FMT_META_BTREE: + return xrep_rmap_scan_meta_btree(&rf, ip); } - /* Scan incore extent cache if this isn't a realtime file. */ - if (xfs_ifork_is_realtime(ip, whichfork)) - return 0; - - return xrep_rmap_scan_iext(&rf, ifp); + return 0; } /* @@ -578,23 +639,9 @@ xrep_rmap_scan_inode( struct xrep_rmap *rr, struct xfs_inode *ip) { - unsigned int lock_mode = 0; + unsigned int lock_mode = xrep_rmap_scan_ilock(ip); int error; - /* - * Directory updates (create/link/unlink/rename) drop the directory's - * ILOCK before finishing any rmapbt updates associated with directory - * shape changes. For this scan to coordinate correctly with the live - * update hook, we must take the only lock (i_rwsem) that is held all - * the way to dir op completion. This will get fixed by the parent - * pointer patchset. - */ - if (S_ISDIR(VFS_I(ip)->i_mode)) { - lock_mode = XFS_IOLOCK_SHARED; - xfs_ilock(ip, lock_mode); - } - lock_mode |= xrep_rmap_scan_ilock(ip); - /* Check the data fork. 
*/ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); if (error) @@ -644,7 +691,7 @@ xrep_rmap_walk_inobt( return error; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL) return -EFSCORRUPTED; agino = irec.ir_startino; @@ -823,7 +870,7 @@ xrep_rmap_find_log_rmaps( { struct xfs_scrub *sc = rr->sc; - if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag))) return 0; return xrep_rmap_stash(rr, @@ -998,7 +1045,7 @@ xrep_rmap_try_reserve( { struct xrep_rmap_agfl ra = { .bitmap = freesp_blocks, - .agno = rr->sc->sa.pag->pag_agno, + .agno = pag_agno(rr->sc->sa.pag), }; struct xfs_scrub *sc = rr->sc; struct xrep_newbt_resv *resv, *n; @@ -1294,7 +1341,6 @@ xrep_rmap_build_new_tree( struct xfs_perag *pag = sc->sa.pag; struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_btree_cur *rmap_cur; - xfs_fsblock_t fsbno; int error; /* @@ -1312,9 +1358,9 @@ xrep_rmap_build_new_tree( * rmapbt per-AG reservation, which we will adjust further after * committing the new btree. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, - fsbno, XFS_AG_RESV_RMAPBT); + xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)), + XFS_AG_RESV_RMAPBT); rr->new_btree.bload.get_records = xrep_rmap_get_records; rr->new_btree.bload.claim_block = xrep_rmap_claim_block; rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; @@ -1575,7 +1621,7 @@ xrep_rmapbt_live_update( if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) goto out_unlock; - trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); if (error) @@ -1619,7 +1665,7 @@ xrep_rmap_setup_scan( /* Set up in-memory rmap btree */ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); if (error) goto out_mutex; @@ -1634,7 +1680,7 @@ xrep_rmap_setup_scan( */ ASSERT(sc->flags & XCHK_FSGATES_RMAP); xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); - error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook); if (error) goto out_iscan; return 0; @@ -1655,7 +1701,7 @@ xrep_rmap_teardown( struct xfs_scrub *sc = rr->sc; xchk_iscan_abort(&rr->iscan); - xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook); xchk_iscan_teardown(&rr->iscan); xfbtree_destroy(&rr->rmap_btree); mutex_destroy(&rr->lock); diff --git a/fs/xfs/scrub/rtb_bitmap.h b/fs/xfs/scrub/rtb_bitmap.h new file mode 100644 index 000000000000..1313ef605511 --- /dev/null +++ b/fs/xfs/scrub/rtb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTB_BITMAP_H__ +#define __XFS_SCRUB_RTB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_rtblock_t */ + +struct xrtb_bitmap { + struct xbitmap64 rtbitmap; +}; + +static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->rtbitmap); +} + +static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->rtbitmap); +} + +static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap, + xfs_rtblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->rtbitmap, start, len); +} + +static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->rtbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_RTB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 46583517377f..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -9,17 +9,25 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_rtgroup.h" #include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" +#include "scrub/tempexch.h" #include "scrub/rtbitmap.h" +#include "scrub/btree.h" /* Set us up with the realtime metadata locked. */ int @@ -30,10 +38,19 @@ xchk_setup_rtbitmap( struct xchk_rtbitmap *rtb; int error; - rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)), + XCHK_GFP_FLAGS); if (!rtb) return -ENOMEM; sc->buf = rtb; + rtb->sc = sc; + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; if (xchk_could_repair(sc)) { error = xrep_setup_rtbitmap(sc, rtb); @@ -45,7 +62,7 @@ xchk_setup_rtbitmap( if (error) return error; - error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg)); if (error) return error; @@ -53,7 +70,9 @@ xchk_setup_rtbitmap( if (error) return error; - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; /* * Now that we've locked the rtbitmap, we can't race with growfsrt @@ -61,32 +80,65 @@ xchk_setup_rtbitmap( * Hence it is safe to compute and check the geometry values. */ if (mp->m_sb.sb_rblocks) { - rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); rtb->rextslog = xfs_compute_rextslog(rtb->rextents); - rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp); } + return 0; } -/* Realtime bitmap. */ +/* Per-rtgroup bitmap contents. */ + +/* Cross-reference rtbitmap entries with other metadata. 
*/ +STATIC void +xchk_rtbitmap_xref( + struct xchk_rtbitmap *rtb, + xfs_rtblock_t startblock, + xfs_rtblock_t blockcount) +{ + struct xfs_scrub *sc = rtb->sc; + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + if (!sc->sr.rmap_cur) + return; + + xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); + xchk_xref_is_not_rt_shared(sc, rgbno, blockcount); + xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount); + + if (rtb->next_free_rgbno < rgbno) + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, + rgbno - rtb->next_free_rgbno); + rtb->next_free_rgbno = rgbno + blockcount; +} /* Scrub a free extent record from the realtime bitmap. */ STATIC int xchk_rtbitmap_rec( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_scrub *sc = priv; + struct xchk_rtbitmap *rtb = priv; + struct xfs_scrub *sc = rtb->sc; xfs_rtblock_t startblock; xfs_filblks_t blockcount; - startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); - blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); + startblock = xfs_rtx_to_rtb(rtg, rec->ar_startext); + blockcount = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); - if (!xfs_verify_rtbext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + + xchk_rtbitmap_xref(rtb, startblock, blockcount); + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return 0; } @@ -134,24 +186,27 @@ xchk_rtbitmap_check_extents( return error; } -/* Scrub the realtime bitmap. */ +/* Scrub this group's realtime bitmap. */ int xchk_rtbitmap( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xchk_rtbitmap *rtb = sc->buf; + xfs_rgblock_t last_rgbno; int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rtb->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* Is sb_rextslog correct? */ if (mp->m_sb.sb_rextslog != rtb->rextslog) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -160,17 +215,17 @@ xchk_rtbitmap( * case can we exceed 4bn bitmap blocks since the super field is a u32. */ if (rtb->rbmblocks > U32_MAX) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* The bitmap file length must be aligned to an fsblock. */ - if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + if (rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -179,8 +234,8 @@ xchk_rtbitmap( * growfsrt expands the bitmap file before updating sb_rextents, so the * file can be larger than sb_rbmblocks. 
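* * For example (illustrative numbers, not taken from this patch): with * 4096-byte fs blocks and rtb->rbmblocks == 8, an i_disk_size of 40960 bytes * is acceptable here because growfsrt may still be expanding the file, but * an i_disk_size below 32768 bytes cannot hold the whole bitmap and is * flagged as corrupt.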
*/ - if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + if (rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -193,10 +248,20 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); + rtb->next_free_rgbno = 0; + error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; + /* + * Check that there are rmappings for all rt extents between the end of + * the last free extent we saw and the last possible extent in the rt + * group. + */ + last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1; + if (rtb->next_free_rgbno < last_rgbno) + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, + last_rgbno - rtb->next_free_rgbno); return 0; } @@ -207,6 +272,7 @@ xchk_xref_is_used_rt_space( xfs_rtblock_t rtbno, xfs_extlen_t len) { + struct xfs_rtgroup *rtg = sc->sr.rtg; xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -215,15 +281,19 @@ xchk_xref_is_used_rt_space( if (xchk_skip_xref(sc->sm)) return; + if (xfs_has_zoned(sc->mp)) { + if (!xfs_zone_rgbno_is_valid(rtg, + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); + return; + } + startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) - goto out_unlock; + return; if (is_free) - xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino); -out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); } diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h index 85304ff019e1..fe52b877253d 100644 --- a/fs/xfs/scrub/rtbitmap.h +++ b/fs/xfs/scrub/rtbitmap.h @@ -6,17 +6,72 @@ #ifndef __XFS_SCRUB_RTBITMAP_H__ #define __XFS_SCRUB_RTBITMAP_H__ +/* + * We use an xfile to construct new bitmap blocks for the portion of the + * rtbitmap file that we're replacing. Whereas the ondisk bitmap must be + * accessed through the buffer cache, the xfile bitmap supports direct + * word-level accesses. Therefore, we create a small abstraction for linear + * access. + */ +typedef unsigned long long xrep_wordoff_t; +typedef unsigned int xrep_wordcnt_t; + +/* Mask to round an rtx down to the nearest bitmap word. */ +#define XREP_RTBMP_WORDMASK ((1ULL << XFS_NBWORDLOG) - 1) + + struct xchk_rtbitmap { + struct xfs_scrub *sc; + uint64_t rextents; uint64_t rbmblocks; unsigned int rextslog; unsigned int resblks; + + /* The next free rt group block number that we expect to see. */ + xfs_rgblock_t next_free_rgbno; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* stuff for staging a new bitmap */ + struct xfs_rtalloc_args args; + struct xrep_tempexch tempexch; +#endif + + /* The next rtgroup block we expect to see during our rtrmapbt walk. */ + xfs_rgblock_t next_rgbno; + + /* rtgroup lock flags */ + unsigned int rtglock_flags; + + /* rtword position of xfile as we write buffers to disk. */ + xrep_wordoff_t prep_wordoff; + + /* In-memory rtbitmap for repair.
*/ + union xfs_rtword_raw words[]; }; #ifdef CONFIG_XFS_ONLINE_REPAIR int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); + +/* + * How big should the words[] buffer be? + * + * For repairs, we want a full fsblock worth of space so that we can memcpy a + * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have + * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the + * words buffer at all. + */ +static inline unsigned int +xchk_rtbitmap_wordcnt( + struct xfs_scrub *sc) +{ + if (xchk_could_repair(sc)) + return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG; + return 0; +} #else # define xrep_setup_rtbitmap(sc, rtb) (0) +# define xchk_rtbitmap_wordcnt(sc) (0) #endif /* CONFIG_XFS_ONLINE_REPAIR */ #endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index 46f5d5f605c9..203a1a97c502 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -12,32 +12,66 @@ #include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_rtalloc.h" #include "xfs_inode.h" #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_extent_busy.h" +#include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" #include "scrub/rtbitmap.h" -/* Set up to repair the realtime bitmap file metadata. */ +/* rt bitmap content repairs */ + +/* Set up to repair the realtime bitmap for this group. */ int xrep_setup_rtbitmap( struct xfs_scrub *sc, struct xchk_rtbitmap *rtb) { struct xfs_mount *mp = sc->mp; - unsigned long long blocks = 0; + char *descr; + unsigned long long blocks = mp->m_sb.sb_rbmblocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* Create an xfile to hold our reconstructed bitmap. */ + descr = xchk_xfile_rtgroup_descr(sc, "bitmap file"); + error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile); + kfree(descr); + if (error) + return error; /* - * Reserve enough blocks to write out a completely new bmbt for a - * maximally fragmented bitmap file. We do not hold the rtbitmap - * ILOCK yet, so this is entirely speculative. + * Reserve enough blocks to write out a completely new bitmap file, + * plus twice as many blocks as we would need if we can only allocate + * one block per data fork mapping. This should cover the + * preallocation of the temporary file and exchanging the extent + * mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement bitmap and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtbitmap ILOCK) and cannot ask for more reservation. 
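+ * + * As a rough worked example (numbers invented for illustration): with + * sb_rbmblocks == 100, this works out to + * + *	blocks = 100 + 2 * xfs_bmbt_calc_size(mp, 100); + * + * that is, room to write the 100 replacement bitmap blocks, plus a + * worst-case bmbt once for the temporary file preallocation and once more + * for the mapping exchange.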
*/ - blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; if (blocks > UINT_MAX) return -EOPNOTSUPP; @@ -45,6 +79,325 @@ xrep_setup_rtbitmap( return 0; } +static inline xrep_wordoff_t +rtx_to_wordoff( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return rtx >> XFS_NBWORDLOG; +} + +static inline xrep_wordcnt_t +rtxlen_to_wordcnt( + xfs_rtxlen_t rtxlen) +{ + return rtxlen >> XFS_NBWORDLOG; +} + +/* Helper functions to record rtwords in an xfile. */ + +static inline int +xfbmp_load( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + xfs_rtword_t *word) +{ + union xfs_rtword_raw urk; + int error; + + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); + + error = xfile_load(rtb->sc->xfile, &urk, + sizeof(union xfs_rtword_raw), + wordoff << XFS_WORDLOG); + if (error) + return error; + + *word = be32_to_cpu(urk.rtg); + return 0; +} + +static inline int +xfbmp_store( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + const xfs_rtword_t word) +{ + union xfs_rtword_raw urk; + + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); + + urk.rtg = cpu_to_be32(word); + return xfile_store(rtb->sc->xfile, &urk, + sizeof(union xfs_rtword_raw), + wordoff << XFS_WORDLOG); +} + +static inline int +xfbmp_copyin( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + const union xfs_rtword_raw *word, + xrep_wordcnt_t nr_words) +{ + return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, + wordoff << XFS_WORDLOG); +} + +static inline int +xfbmp_copyout( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + union xfs_rtword_raw *word, + xrep_wordcnt_t nr_words) +{ + return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, + wordoff << XFS_WORDLOG); +} + +/* Perform a logical OR operation on an rtword in the incore bitmap. */ +static int +xrep_rtbitmap_or( + struct xchk_rtbitmap *rtb, + xrep_wordoff_t wordoff, + xfs_rtword_t mask) +{ + xfs_rtword_t word; + int error; + + error = xfbmp_load(rtb, wordoff, &word); + if (error) + return error; + + trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word); + + return xfbmp_store(rtb, wordoff, word | mask); +} + +/* + * Mark as free every rt extent between the next rt block we expected to see + * in the rtrmap records and the given rt block. + */ +STATIC int +xrep_rtbitmap_mark_free( + struct xchk_rtbitmap *rtb, + xfs_rgblock_t rgbno) +{ + struct xfs_mount *mp = rtb->sc->mp; + struct xchk_rt *sr = &rtb->sc->sr; + struct xfs_rtgroup *rtg = sr->rtg; + xfs_rtxnum_t startrtx; + xfs_rtxnum_t nextrtx; + xrep_wordoff_t wordoff, nextwordoff; + unsigned int bit; + unsigned int bufwsize; + xfs_extlen_t mod; + xfs_rtword_t mask; + enum xbtree_recpacking outcome; + int error; + + if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno)) + return -EFSCORRUPTED; + + /* + * Convert rt blocks to rt extents The block range we find must be + * aligned to an rtextent boundary on both ends. + */ + startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno); + mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno); + if (mod) + return -EFSCORRUPTED; + + nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1; + mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1); + if (mod != mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. 
*/ + if (sr->refc_cur) { + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sr->refc_cur, + XFS_REFC_DOMAIN_COW, rtb->next_rgbno, + rgbno - rtb->next_rgbno, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); + + /* Set bits as needed to round startrtx up to the nearest word. */ + bit = startrtx & XREP_RTBMP_WORDMASK; + if (bit) { + xfs_rtblock_t len = nextrtx - startrtx; + unsigned int lastbit; + + lastbit = min(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx), + mask); + if (error || lastbit - bit == len) + return error; + startrtx += XFS_NBWORD - bit; + } + + /* Set bits as needed to round nextrtx down to the nearest word. */ + bit = nextrtx & XREP_RTBMP_WORDMASK; + if (bit) { + mask = ((xfs_rtword_t)1 << bit) - 1; + + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx), + mask); + if (error || startrtx + bit == nextrtx) + return error; + nextrtx -= bit; + } + + trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1); + + /* Set all the words in between, up to a whole fs block at once. */ + wordoff = rtx_to_wordoff(mp, startrtx); + nextwordoff = rtx_to_wordoff(mp, nextrtx); + bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG; + + while (wordoff < nextwordoff) { + xrep_wordoff_t rem; + xrep_wordcnt_t wordcnt; + + wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff, + bufwsize); + + /* + * Try to keep us aligned to the rtwords buffer to reduce the + * number of xfile writes. + */ + rem = wordoff & (bufwsize - 1); + if (rem) + wordcnt = min_t(xrep_wordcnt_t, wordcnt, + bufwsize - rem); + + error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt); + if (error) + return error; + + wordoff += wordcnt; + } + + return 0; +} + +/* Set free space in the rtbitmap based on rtrmapbt records. */ +STATIC int +xrep_rtbitmap_walk_rtrmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtbitmap *rtb = priv; + int error = 0; + + if (xchk_should_terminate(rtb->sc, &error)) + return error; + + if (rtb->next_rgbno < rec->rm_startblock) { + error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock); + if (error) + return error; + } + + rtb->next_rgbno = max(rtb->next_rgbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Walk the rtrmapbt to find all the gaps between records, and mark the gaps + * in the realtime bitmap that we're computing. + */ +STATIC int +xrep_rtbitmap_find_freespace( + struct xchk_rtbitmap *rtb) +{ + struct xfs_scrub *sc = rtb->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + uint64_t blockcount; + int error; + + /* Prepare a buffer of ones so that we can accelerate bulk setting. */ + memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize); + + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap, + rtb); + if (error) + goto out; + + /* + * Mark as free every possible rt extent from the last one we saw to + * the end of the rt group. 
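+ * + * For instance (hypothetical geometry): in a group with 1000 rt extents + * whose last reverse mapping ends at the start of extent 800, extents + * 800-999 have no owner at all, so this call marks them free in the + * replacement bitmap.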
+ */ + blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize; + if (rtb->next_rgbno < blockcount) { + error = xrep_rtbitmap_mark_free(rtb, blockcount); + if (error) + goto out; + } + +out: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +static int +xrep_rtbitmap_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtbitmap *rtb = data; + struct xfs_mount *mp = sc->mp; + union xfs_rtword_raw *ondisk; + int error; + + rtb->args.mp = sc->mp; + rtb->args.tp = sc->tp; + rtb->args.rbmbp = bp; + ondisk = xfs_rbmblock_wordptr(&rtb->args, 0); + rtb->args.rbmbp = NULL; + + error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk, + mp->m_blockwsize); + if (error) + return error; + + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtbitmap_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + + rtb->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF); + return 0; +} + /* * Make sure that the given range of the data fork of the realtime file is * mapped to written blocks. The caller must ensure that the inode is joined @@ -108,8 +461,6 @@ xrep_rtbitmap_data_mappings( 0, &map, &nmaps); if (error) return error; - if (nmaps != 1) - return -EFSCORRUPTED; /* Commit new extent and all deferred work. */ error = xrep_defer_finish(sc); @@ -162,9 +513,18 @@ xrep_rtbitmap( { struct xchk_rtbitmap *rtb = sc->buf; struct xfs_mount *mp = sc->mp; + struct xfs_group *xg = rtg_group(sc->sr.rtg); unsigned long long blocks = 0; + unsigned int busy_gen; int error; + /* We require the realtime rmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + /* Impossibly large rtbitmap means we can't touch the filesystem. */ if (rtb->rbmblocks > U32_MAX) return 0; @@ -197,6 +557,79 @@ xrep_rtbitmap( if (error) return error; - /* Fix inconsistent bitmap geometry */ - return xrep_rtbitmap_geometry(sc, rtb); + /* + * Fix inconsistent bitmap geometry. This function returns with a + * clean scrub transaction. + */ + error = xrep_rtbitmap_geometry(sc, rtb); + if (error) + return error; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + if (!xfs_extent_busy_list_empty(xg, &busy_gen)) { + error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); + if (error) + return error; + } + + /* + * Generate the new rtbitmap data. We don't need the rtbmp information + * once this call is finished. + */ + error = xrep_rtbitmap_find_freespace(rtb); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtbitmap file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* + * Make sure we have space allocated for the part of the bitmap + * file that corresponds to this group. We already joined sc->ip. 
+ */ + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the bitmap file that we generated. */ + error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks, + xrep_rtbitmap_prep_buf, rtb); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)); + if (error) + return error; + + /* + * Now exchange the data fork contents. We're done with the temporary + * buffer, so we can reuse it for the tempfile exchmaps information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rtb->rbmblocks, &rtb->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rtb->tempexch); + if (error) + return error; + + /* Free the old rtbitmap blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); } diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c new file mode 100644 index 000000000000..4c5dffc73641 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_rtalloc.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/repair.h" + +/* Set us up with the realtime refcount metadata locked. */ +int +xchk_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrefcountbt(sc); + if (error) + return error; + } + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime Reference count btree scrubber. */ + +/* + * Confirming Reference Counts via Reverse Mappings + * + * We want to count the reverse mappings overlapping a refcount record + * (bno, len, refcount), allowing for the possibility that some of the + * overlap may come from smaller adjoining reverse mappings, while some + * comes from single extents which overlap the range entirely. The + * outer loop is as follows: + * + * 1. For all reverse mappings overlapping the refcount extent, + * a. If a given rmap completely overlaps, mark it as seen. + * b. Otherwise, record the fragment (in agbno order) for later + * processing. + * + * Once we've seen all the rmaps, we know that for all blocks in the + * refcount record we want to find $refcount owners and we've already + * visited $seen extents that overlap all the blocks. Therefore, we + * need to find ($refcount - $seen) owners for every block in the + * extent; call that quantity $target_nr. Proceed as follows: + * + * 2. 
Pull the first $target_nr fragments from the list; all of them + * should start at or before the start of the extent. + * Call this subset of fragments the working set. + * 3. Until there are no more unprocessed fragments, + * a. Find the shortest fragments in the set and remove them. + * b. Note the block number of the end of these fragments. + * c. Pull the same number of fragments from the list. All of these + * fragments should start at the block number recorded in the + * previous step. + * d. Put those fragments in the set. + * 4. Check that there are $target_nr fragments remaining in the list, + * and that they all end at or beyond the end of the refcount extent. + * + * If the refcount is correct, all the check conditions in the algorithm + * should always hold true. If not, the refcount is incorrect. + */ +struct xchk_rtrefcnt_frag { + struct list_head list; + struct xfs_rmap_irec rm; +}; + +struct xchk_rtrefcnt_check { + struct xfs_scrub *sc; + struct list_head fragments; + + /* refcount extent we're examining */ + xfs_rgblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + + /* number of owners seen */ + xfs_nlink_t seen; +}; + +/* + * Decide if the given rmap is large enough that we can redeem it + * towards refcount verification now, or if it's a fragment, in + * which case we'll hang onto it in the hopes that we'll later + * discover that we've collected exactly the correct number of + * fragments as the rtrefcountbt says we should have. + */ +STATIC int +xchk_rtrefcountbt_rmap_check( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xchk_rtrefcnt_check *refchk = priv; + struct xchk_rtrefcnt_frag *frag; + xfs_rgblock_t rm_last; + xfs_rgblock_t rc_last; + int error = 0; + + if (xchk_should_terminate(refchk->sc, &error)) + return error; + + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; + rc_last = refchk->bno + refchk->len - 1; + + /* Confirm that a single-owner refc extent is a CoW stage. */ + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { + xchk_btree_xref_set_corrupt(refchk->sc, cur, 0); + return 0; + } + + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { + /* + * The rmap overlaps the refcount record, so we can confirm + * one refcount owner seen. + */ + refchk->seen++; + } else { + /* + * This rmap covers only part of the refcount record, so + * save the fragment for later processing. If the rmapbt + * is healthy each rmap_irec we see will be in agbno order + * so we don't need insertion sort here. + */ + frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag), + XCHK_GFP_FLAGS); + if (!frag) + return -ENOMEM; + memcpy(&frag->rm, rec, sizeof(frag->rm)); + list_add_tail(&frag->list, &refchk->fragments); + } + + return 0; +} + +/* + * Given a bunch of rmap fragments, iterate through them, keeping + * a running tally of the refcount. If this ever deviates from + * what we expect (which is the rtrefcountbt's refcount minus the + * number of extents that totally covered the rtrefcountbt extent), + * we have a rtrefcountbt error. 
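+ * + * A worked example (hypothetical records, not from this patch): suppose a + * refcount record covers blocks 10-15 with refcount 3, and the rmapbt has + * rmaps (given as startblock, length) A [10, 6], B [10, 3], C [13, 3], and + * D [10, 6]. Step 1 counts A and D as seen (seen == 2), so $target_nr == 1. + * Step 2 pulls B into the working set; step 3 drops B at block 13 and pulls + * in C, which starts exactly there. C runs to the end of the extent, so + * the record checks out. Had C started at block 14 instead, no fragment + * would start at 13, and the refcount would be flagged as inconsistent.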
+ */ +STATIC void +xchk_rtrefcountbt_process_rmap_fragments( + struct xchk_rtrefcnt_check *refchk) +{ + struct list_head worklist; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + xfs_rgblock_t bno; + xfs_rgblock_t rbno; + xfs_rgblock_t next_rbno; + xfs_nlink_t nr; + xfs_nlink_t target_nr; + + target_nr = refchk->refcount - refchk->seen; + if (target_nr == 0) + return; + + /* + * There are (refchk->refcount - refchk->seen) + * references we haven't found yet. Pull that many off the + * fragment list and figure out where the smallest rmap ends + * (and therefore the next rmap should start). All the rmaps + * we pull off should start at or before the beginning of the + * refcount record's range. + */ + INIT_LIST_HEAD(&worklist); + rbno = NULLRGBLOCK; + + /* Make sure the fragments actually /are/ in bno order. */ + bno = 0; + list_for_each_entry(frag, &refchk->fragments, list) { + if (frag->rm.rm_startblock < bno) + goto done; + bno = frag->rm.rm_startblock; + } + + /* + * Find all the rmaps that start at or before the refc extent, + * and put them on the worklist. + */ + nr = 0; + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) + break; + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno < rbno) + rbno = bno; + list_move_tail(&frag->list, &worklist); + nr++; + } + + /* + * We should have found exactly $target_nr rmap fragments starting + * at or before the refcount extent. + */ + if (nr != target_nr) + goto done; + + while (!list_empty(&refchk->fragments)) { + /* Discard any fragments ending at rbno from the worklist. */ + nr = 0; + next_rbno = NULLRGBLOCK; + list_for_each_entry_safe(frag, n, &worklist, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (bno != rbno) { + if (bno < next_rbno) + next_rbno = bno; + continue; + } + list_del(&frag->list); + kfree(frag); + nr++; + } + + /* Try to add nr rmaps starting at rbno to the worklist. */ + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; + if (frag->rm.rm_startblock != rbno) + goto done; + list_move_tail(&frag->list, &worklist); + if (next_rbno > bno) + next_rbno = bno; + nr--; + if (nr == 0) + break; + } + + /* + * If we get here and nr > 0, this means that we added fewer + * items to the worklist than we discarded because the fragment + * list ran out of items. Therefore, we cannot maintain the + * required refcount. Something is wrong, so we're done. + */ + if (nr) + goto done; + + rbno = next_rbno; + } + + /* + * Make sure the last extent we processed ends at or beyond + * the end of the refcount extent. + */ + if (rbno < refchk->bno + refchk->len) + goto done; + + /* Actually record us having seen the remaining refcount. */ + refchk->seen = refchk->refcount; +done: + /* Delete fragments and work list. */ + list_for_each_entry_safe(frag, n, &worklist, list) { + list_del(&frag->list); + kfree(frag); + } + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + /* Use the rmap entries covering this extent to verify the refcount.
*/ +STATIC void +xchk_rtrefcountbt_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + struct xchk_rtrefcnt_check refchk = { + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, + .seen = 0, + }; + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + struct xchk_rtrefcnt_frag *frag; + struct xchk_rtrefcnt_frag *n; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Cross-reference with the rmapbt to confirm the refcount. */ + memset(&low, 0, sizeof(low)); + low.rm_startblock = irec->rc_startblock; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; + + INIT_LIST_HEAD(&refchk.fragments); + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check, &refchk); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + goto out_free; + + xchk_rtrefcountbt_process_rmap_fragments(&refchk); + if (irec->rc_refcount != refchk.seen) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + +out_free: + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { + list_del(&frag->list); + kfree(frag); + } +} + +/* Cross-reference with the other btrees. */ +STATIC void +xchk_rtrefcountbt_xref( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock), + irec->rc_blockcount); + xchk_rtrefcountbt_xref_rmap(sc, irec); +} + +struct xchk_rtrefcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next rtgroup block where we aren't expecting shared extents. */ + xfs_rgblock_t next_unshared_rgbno; + + /* Number of CoW blocks we expect. */ + xfs_extlen_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +static inline bool +xchk_rtrefcount_mergeable( + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount == 0) + return false; + + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + XFS_REFC_LEN_MAX) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrefcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrefcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrefcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + +STATIC int +xchk_rtrefcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_rgblock_t *next_bno = priv; + + if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings.
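+ * + * Concretely (an invented example): if one shared extent ends at rgbno 50 + * and the next refcount record starts at rgbno 80, then each of blocks + * 50-79 may be covered by at most one rmap; a second overlapping rmap in + * that range implies a missing refcount record, so the walk returns + * -ECANCELED and the gap is flagged as corrupt.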
+ */ +static inline void +xchk_rtrefcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_rtrefcbt_records *rrc, + xfs_rtblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_rgblock_t next_bno = NULLRGBLOCK; + int error; + + if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_rgbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, + xchk_rtrefcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur); +} + +/* Scrub a rtrefcountbt record. */ +STATIC int +xchk_rtrefcountbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xchk_rtrefcbt_records *rrc = bs->private; + struct xfs_refcount_irec irec; + u32 mod; + + xfs_refcount_btrec_to_irec(rec, &irec); + if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) != + NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* We can only share full rt extents. */ + mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount); + if (mod) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + rrc->cow_blocks += irec.rc_blockcount; + + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + + xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec); + xchk_rtrefcountbt_xref(bs->sc, &irec); + + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_rtrefcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_rgbno = irec.rc_startblock + + irec.rc_blockcount; + } + + return 0; +} + +/* Make sure we have as many refc blocks as the rmap says. */ +STATIC void +xchk_refcount_xref_rmap( + struct xfs_scrub *sc, + const struct xfs_owner_info *btree_oinfo, + xfs_extlen_t cow_blocks) +{ + xfs_filblks_t refcbt_blocks = 0; + xfs_filblks_t blocks; + int error; + + if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + /* Check that we saw as many refcbt blocks as the rmap knows about. */ + error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks); + if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error)) + return; + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo, + &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (blocks != refcbt_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + + /* Check that we saw as many cow blocks as the rmap knows about. */ + error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (blocks != cow_blocks) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* Scrub the refcount btree for some AG. 
*/ +int +xchk_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xfs_owner_info btree_oinfo; + struct xchk_rtrefcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_rgbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino, + XFS_DATA_FORK); + error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec, + &btree_oinfo, &rrc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the rt volume have at most one reverse mapping. + */ + xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks); + + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); + + return 0; +} + +/* xref check that a cow staging extent is marked in the rtrefcountbt. */ +void +xchk_xref_is_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + struct xfs_refcount_irec rc; + int has_refcount; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + /* Find the CoW staging extent. */ + error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (!has_refcount) { + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + return; + } + + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); + + /* Must be at least as long as what was passed in */ + if (rc.rc_blockcount < len) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* + * xref check that the extent is not shared. Only file data blocks + * can have multiple owners. + */ +void +xchk_xref_is_not_rt_shared( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, + XFS_REFC_DOMAIN_SHARED, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_rt_cow_staging( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, + bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c new file mode 100644 index 000000000000..983362447826 --- /dev/null +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * rt refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. 
+ * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the rt refcount btree root and + * insert all the records. + */ + +struct xrep_rtrefc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xfsb_bitmap old_rtrefcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the rt refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_filblks_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_rtrefcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_rtrefc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + xfs_rgblock_t last; + + if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0) + return -EFSCORRUPTED; + + last = rec->rc_startblock + rec->rc_blockcount - 1; + if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1) + return -EFSCORRUPTED; + + /* Make sure this isn't free space or misaligned. */ + return xrep_require_rtext_inuse(sc, rec->rc_startblock, + rec->rc_blockcount); +} + +/* Record a reference count extent. */ +STATIC int +xrep_rtrefc_stash( + struct xrep_rtrefc *rr, + enum xfs_refc_domain domain, + xfs_rgblock_t bno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = bno, + .rc_blockcount = len, + .rc_refcount = refcount, + .rc_domain = domain, + }; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); + + error = xrep_rtrefc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrefc_stash_cow( + struct xrep_rtrefc *rr, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_rtrefc_rmap_shareable( + const struct xfs_rmap_irec *rmap) +{ + /* rt metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Unwritten file blocks are not shareable. */ + if (rmap->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + + return true; +} + +/* Grab the next (abbreviated) rmap record from the rmapbt. */ +STATIC int +xrep_rtrefc_walk_rmaps( + struct xrep_rtrefc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. 
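+ * + * To illustrate (made-up records): an rmap owned by XFS_RMAP_OWN_COW is + * stashed as a CoW staging record and skipped; an unwritten data fork + * rmap is skipped because unwritten extents cannot be shared; and a + * written data fork rmap ends the loop and is handed back to the caller.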
+ */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (xfs_is_sb_inum(mp, rmap->rm_owner) || + (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + } while (!xrep_rtrefc_rmap_shareable(rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_rtrefc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* + * Compare two refcount records. We want to sort in order of increasing block + * number. + */ +static int +xrep_rtrefc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_rtrefc_encode_startblock(ap); + sb = xrep_rtrefc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_rtrefc_sort_records( + struct xrep_rtrefc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_rgblock_t next_rgbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_rgbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_rgbno) + return -EFSCORRUPTED; + + next_rgbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* Record extents that belong to the realtime refcount inode. */ +STATIC int +xrep_rtrefc_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. */ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. 
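+ * + * For example (hypothetical records): with rmaps starting at rgbnos 12, + * 12, and 20 and bno == 12, this pushes both records at 12 onto the bag, + * then steps the cursor back one record so that the not-yet-consumed + * record at 20 is returned again by the next walk.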
+ */ +static int +xrep_rtrefc_push_rmaps_at( + struct xrep_rtrefc *rr, + struct rcbag *rcstack, + xfs_rgblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_rtrefc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sr.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Scan one AG for reverse mappings for the realtime refcount btree. */ +STATIC int +xrep_rtrefc_scan_ag( + struct xrep_rtrefc *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_rtrefc_find_refcounts( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + struct xfs_perag *pag = NULL; + uint64_t old_stack_height; + xfs_rgblock_t sbno; + xfs_rgblock_t cbno; + xfs_rgblock_t nbno; + bool have; + int error; + + /* Scan for old rtrefc btree blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrefc_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + xrep_rtgroup_btcur_init(sc, &sc->sr); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If this exceeds + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rtrmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sr.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sr.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... 
*/ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_rtrefc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_rtgroup_btcur_free(&sc->sr); + return error; +} + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_rtrefc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + &cur->bc_rec.rc); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrefc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrefc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrefc_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level, + nr_this_level); +} + +/* + * Use the collected refcount information to stage a new rt refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_rtrefc_build_new_tree( + struct xrep_rtrefc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *refc_cur; + int error; + + error = xrep_rtrefc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime refcount inode. + */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrefc_get_records; + rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size; + + refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need. 
 */ + error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrefcountbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* Add all observed refcount records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_cur; + + /* + * Install the new rtrefc btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(refc_cur, 0); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); +err_cur: + xfs_btree_del_cursor(refc_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Rebuild the rt refcount btree. */ +int +xrep_rtrefcountbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrefc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rtrmapbt to rebuild anything. */ + if (!xfs_has_rtrmapbt(mp)) + return -EOPNOTSUPP; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per rt extent. */ + descr = xchk_xfile_rtgroup_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_rextents, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks); + error = xrep_rtrefc_find_refcounts(rr); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the refcount information. */ + error = xrep_rtrefc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c new file mode 100644 index 000000000000..12989fe80e8b --- /dev/null +++ b/fs/xfs/scrub/rtrmap.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 
+ * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_rtalloc.h" +#include "xfs_rtgroup.h" +#include "xfs_metafile.h" +#include "xfs_refcount.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* Set us up with the realtime metadata locked. */ +int +xchk_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + int error; + + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtrmapbt(sc); + if (error) + return error; + } + + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + error = xchk_setup_rt(sc); + if (error) + return error; + + error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg)); + if (error) + return error; + + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); +} + +/* Realtime reverse mapping. */ + +struct xchk_rtrmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; +}; + +static inline bool +xchk_rtrmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_rtreflink(sc->mp)) + return false; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rtrmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_rtblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; + + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rtrmapbt_is_shareable(bs->sc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rtrmap_mergeable( + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. 
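+	 * A mergeable pair would look like (owner O, startblock 10, len 4, + * offset 0) followed by (owner O, startblock 14, len 2, offset 4): + * same owner and flags, physically and logically contiguous, and the + * combined length still within XFS_RMAP_LEN_MAX. Finding such a pair + * means the btree is corrupt, since the records should have been + * written out as one.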
*/ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (r1->rm_flags != r2->rm_flags) + return false; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rtrmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rtrmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rtrmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Cross-reference a rmap against the refcount btree. */ +STATIC void +xchk_rtrmapbt_xref_rtrefc( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + xfs_rgblock_t fbno; + xfs_extlen_t flen; + bool is_inode; + bool is_bmbt; + bool is_attr; + bool is_unwritten; + int error; + + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) + return; + + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + /* If this is shared, must be a data fork extent. */ + error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock, + irec->rm_blockcount, &fbno, &flen, false); + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) + return; + if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten)) + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); +} + +/* Cross-reference with other metadata. */ +STATIC void +xchk_rtrmapbt_xref( + struct xfs_scrub *sc, + struct xfs_rmap_irec *irec) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), + irec->rm_blockcount); + if (irec->rm_owner == XFS_RMAP_OWN_COW) + xchk_xref_is_cow_staging(sc, irec->rm_startblock, + irec->rm_blockcount); + else + xchk_rtrmapbt_xref_rtrefc(sc, irec); +} + +/* Scrub a realtime rmapbt record. */ +STATIC int +xchk_rtrmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rtrmap *cr = bs->private; + struct xfs_rmap_irec irec; + + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + xchk_rtrmapbt_check_mergeable(bs, cr, &irec); + xchk_rtrmapbt_check_overlapping(bs, cr, &irec); + xchk_rtrmapbt_xref(bs->sc, &irec); + return 0; +} + +/* Scrub the realtime rmap btree. 
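+ * Each record is checked against its neighbors for overlaps and missed + * merges, then cross-referenced against the rt bitmap and the refcount + * data. For example, a mapping that the refcount btree reports as + * shared must be a written data fork extent; a shared range backed by + * an attr fork, bmbt, or unwritten mapping is flagged as a + * cross-referencing error.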
 */ +int +xchk_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = rtg_rmap(sc->sr.rtg); + struct xfs_owner_info oinfo; + struct xchk_rtrmap cr = { }; + int error; + + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK); + return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr); +} + +/* xref check that the extent has no realtime reverse mapping at all */ +void +xchk_xref_has_no_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is completely mapped */ +void +xchk_xref_has_rt_owner( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (outcome != XBTREE_RECPACKING_FULL) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} + +/* xref check that the extent is only owned by a given owner */ +void +xchk_xref_is_only_rt_owned_by( + struct xfs_scrub *sc, + xfs_rgblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) +{ + struct xfs_rmap_matches res; + int error; + + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) + return; + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); + if (res.non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); +} diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c new file mode 100644 index 000000000000..fc2592c53af5 --- /dev/null +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -0,0 +1,987 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_quota.h" +#include "xfs_rtalloc.h" +#include "xfs_ag.h" +#include "xfs_rtgroup.h" +#include "xfs_refcount.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/rgb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Realtime Reverse Mapping Btree Repair + * ===================================== + * + * This isn't quite as difficult as repairing the rmap btree on the data + * device, since we only store the data fork extents of realtime files on the + * realtime device. We still have to freeze the filesystem and stop the + * background threads like we do for the rmap repair, but we only have to scan + * realtime inodes. + * + * Collecting entries for the new realtime rmap btree is easy -- all we have + * to do is generate rtrmap entries from the data fork mappings of all realtime + * files in the filesystem. We then scan the rmap btrees of the data device + * looking for extents belonging to the old btree and note them in a bitmap. + * + * To rebuild the realtime rmap btree, we bulk-load the collected mappings into + * a new btree cursor and atomically swap that into the realtime inode. Then + * we can free the blocks from the old btree. + * + * We use the 'xrep_rtrmap' prefix for all the rmap functions. + */ + +/* Context for collecting rmaps */ +struct xrep_rtrmap { + /* new rtrmapbt information */ + struct xrep_newbt new_btree; + + /* lock for the xfbtree and xfile */ + struct mutex lock; + + /* rmap records generated from primary metadata */ + struct xfbtree rtrmap_btree; + + struct xfs_scrub *sc; + + /* bitmap of old rtrmapbt blocks */ + struct xfsb_bitmap old_rtrmapbt_blocks; + + /* Hooks into rtrmap update code. */ + struct xfs_rmap_hook rhook; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* in-memory btree cursor for the ->get_blocks walk */ + struct xfs_btree_cur *mcur; + + /* Number of records we're staging in the new btree. */ + uint64_t nr_records; +}; + +/* Set us up to repair rt reverse mapping btrees. */ +int +xrep_setup_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. 
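+ * "Funny" here means a record that fails the usual rtrmap record sanity + * checks or one that points at space the rt bitmap says is free; either + * way the observations we collected cannot be trusted.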
*/ +STATIC int +xrep_rtrmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + return xrep_require_rtext_inuse(sc, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Store a reverse-mapping record. */ +static inline int +xrep_rtrmap_stash( + struct xrep_rtrmap *rr, + xfs_rgblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + trace_xrep_rtrmap_found(sc->mp, &rmap); + + /* Add entry to in-memory btree. */ + mutex_lock(&rr->lock); + mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); + return error; +} + +/* Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rtrmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + struct xrep_rtrmap *rr; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rtrmap_stash_accumulated( + struct xrep_rtrmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. */ +STATIC int +xrep_rtrmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rtrmap_ifork *rf = priv; + struct xfs_rmap_irec *accum = &rf->accum; + struct xfs_mount *mp = rf->rr->sc->mp; + xfs_rgblock_t rgbno; + unsigned int rmap_flags = 0; + int error; + + if (xfs_rtb_to_rgno(mp, rec->br_startblock) != + rtg_rgno(rf->rr->sc->sr.rtg)) + return 0; + + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock); + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + rgbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. */ + error = xrep_rtrmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = rgbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that maps to the rt volume. Sets @mappings_done to true if we've + * scanned the block mappings in this fork. 
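+ * (If the incore extent cache is already loaded, we leave @mappings_done + * false so that the caller falls back to the cheaper in-core walk in + * xrep_rtrmap_scan_iext.)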
 */ +STATIC int +xrep_rtrmap_scan_bmbt( + struct xrep_rtrmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rtrmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + *mappings_done = false; + + /* + * If the incore extent cache is already loaded, we'll just use the + * incore extent scanner to record mappings. Don't bother walking the + * ondisk extent tree. + */ + if (!xfs_need_iread_extents(ifp)) + return 0; + + /* Accumulate all the mappings in the bmap btree. */ + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK); + error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf); + xfs_btree_del_cursor(cur, error); + if (error) + return error; + + /* Stash any remaining accumulated rmaps and exit. */ + *mappings_done = true; + return xrep_rtrmap_stash_accumulated(rf); +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that maps to this rt group. + */ +STATIC int +xrep_rtrmap_scan_iext( + struct xrep_rtrmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rtrmap_stash_accumulated(rf); +} + +/* Find all the extents on the realtime device mapped by an inode fork. */ +STATIC int +xrep_rtrmap_scan_dfork( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + struct xrep_rtrmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + int error = 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmbt for mappings. If the incore extent tree is + * loaded, we want to scan the cached mappings since that's + * faster when the extent counts are very high. + */ + error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + /* realtime data forks should only be extents or btree */ + return -EFSCORRUPTED; + } + + /* Scan incore extent cache. */ + return xrep_rtrmap_scan_iext(&rf, ifp); +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rtrmap_scan_inode( + struct xrep_rtrmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + /* Skip the rt rmap btree inode. */ + if (rr->sc->ip == ip) + return 0; + + lock_mode = xfs_ilock_data_map_shared(ip); + + /* Check the data fork if it's on the realtime device. */ + if (XFS_IS_REALTIME_INODE(ip)) { + error = xrep_rtrmap_scan_dfork(rr, ip); + if (error) + goto out_unlock; + } + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Record extents that belong to the realtime rmap inode. */ +STATIC int +xrep_rtrmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error = 0; + + if (xchk_should_terminate(rr->sc, &error)) + return error; + + /* Skip extents which are not owned by this inode and fork. 
*/ + if (rec->rm_owner != rr->sc->ip->i_ino) + return 0; + + error = xrep_check_ino_btree_mapping(rr->sc, rec); + if (error) + return error; + + return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks, + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), + rec->rm_blockcount); +} + +/* Scan one AG for reverse mappings for the realtime rmap btree. */ +STATIC int +xrep_rtrmap_scan_ag( + struct xrep_rtrmap *rr, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr); + xchk_ag_free(sc, &sc->sa); + return error; +} + +struct xrep_rtrmap_stash_run { + struct xrep_rtrmap *rr; + uint64_t owner; +}; + +static int +xrep_rtrmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rtrmap_stash_run *rsr = priv; + struct xrep_rtrmap *rr = rsr->rr; + xfs_rgblock_t rgbno = start; + + return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rtrmap_stash_bitmap( + struct xrep_rtrmap *rr, + struct xrgb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rtrmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + }; + + return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rtrmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xrgb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xrgb_bitmap_set(bitmap, irec->rc_startblock, + irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. + */ +STATIC int +xrep_rtrmap_find_refcount_rmaps( + struct xrep_rtrmap *rr) +{ + struct xrgb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_rtreflink(sc->mp)) + return 0; + + xrgb_bitmap_init(&cow_blocks); + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high, + xrep_rtrmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xrgb_bitmap_destroy(&cow_blocks); + return error; +} + +/* Count and check all collected records. */ +STATIC int +xrep_rtrmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + int error; + + error = xrep_rtrmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + +/* Generate all the reverse-mappings for the realtime device. 
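+ * This happens in stages: stash an rmap for the rt group superblock if + * there is one; walk the rt refcount data for CoW staging extents; scan + * every realtime file in the filesystem while live hooks keep the + * in-memory btree current; note the old rtrmapbt blocks in each AG; and + * finally re-lock everything and count and check the collected records.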
*/ +STATIC int +xrep_rtrmap_find_rmaps( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = NULL; + struct xfs_inode *ip; + struct xfs_btree_cur *mcur; + int error; + + /* Generate rmaps for the realtime superblock */ + if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) { + error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize, + XFS_RMAP_OWN_FS, 0, 0); + if (error) + return error; + } + + /* Find CoW staging extents. */ + xrep_rtgroup_btcur_init(sc, &sc->sr); + error = xrep_rtrmap_find_refcount_rmaps(rr); + xchk_rtgroup_btcur_free(&sc->sr); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the realtime metadata inodes and cancel the transaction to + * release the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + xchk_rtgroup_unlock(&sc->sr); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rtrmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the RT metadata in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_rt(sc); + if (error) + return error; + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); + if (error) + return error; + + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + /* Scan for old rtrmap blocks. */ + while ((pag = xfs_perag_next(sc->mp, pag))) { + error = xrep_rtrmap_scan_ag(rr, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned rt files in the filesystem. At the same time, + * check all our records before we start building a new btree, which + * requires the rtbitmap lock. + */ + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree); + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr); + xfs_btree_del_cursor(mcur, error); + + return error; +} + +/* Building the new rtrmap btree. */ + +/* Retrieve rtrmapbt data for bulk load. 
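+ * Each call streams the next nr_wanted records out of the in-memory + * btree via rr->mcur, one cursor increment per record, and formats them + * into the new btree block.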
 */ +STATIC int +xrep_rtrmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rtrmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rtrmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_rtrmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level); +} + +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. + */ +STATIC int +xrep_rtrmap_build_new_tree( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_btree_cur *rmap_cur; + int error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the realtime rmapbt inode. + */ + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); + if (error) + return error; + + rr->new_btree.bload.get_records = xrep_rtrmap_get_records; + rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block; + rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size; + + rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake); + + /* Compute how many blocks we'll need for the rmaps collected. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* + * Guess how many blocks we're going to need to rebuild an entire + * rtrmapbt from the number of extents we found, and pump up our + * transaction to have sufficient block reservation. We're allowed + * to exceed quota to repair inconsistent metadata, though this is + * unlikely. + */ + error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg), + rr->new_btree.bload.nr_blocks, 0, true); + if (error) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Create a cursor to the in-memory btree so that we can bulk load the + * new btree. 
 */ + rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree); + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_mcur; + + /* Add all observed rmap records. */ + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + if (error) + goto err_mcur; + + /* + * Install the new rtrmap btree in the inode. After this point the old + * btree is no longer accessible, the new tree is live, and we can + * delete the cursor. + */ + xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp); + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); + xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); +err_cur: + xfs_btree_del_cursor(rmap_cur, error); + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Reaping the old btree. */ + +static inline bool +xrep_rtrmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * We scanned the CoW staging extents before we started the iscan, so + * we need all the updates. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return true; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rtrmapbt update from the regular filesystem into our shadow btree. + * We're running from the thread that owns the rtrmap ILOCK and is generating + * the update, so we must be careful about which parts of struct xrep_rtrmap + * we change. + */ +static int +xrep_rtrmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rtrmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) { + xchk_iscan_abort(&rr->iscan); + goto out_unlock; + } + + mutex_lock(&rr->lock); + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rtrmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rtrmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); +out_unlock: + return NOTIFY_DONE; +} + +/* Set up the filesystem scan components. 
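+ * The ordering below matters: the bitmap and the in-memory btree must + * exist before the rmap hook is armed, because the hook can fire as + * soon as it is added.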
*/ +STATIC int +xrep_rtrmap_setup_scan( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + mutex_init(&rr->lock); + xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); + + /* Set up some storage */ + error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp, + rtg_rgno(sc->sr.rtg)); + if (error) + goto out_bitmap; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rtrmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * rtrmap ILOCK to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update); + error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook); + if (error) + goto out_iscan; + return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rtrmap_btree); +out_bitmap: + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); + return error; +} + +/* Tear down scan components. */ +STATIC void +xrep_rtrmap_teardown( + struct xrep_rtrmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook); + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rtrmap_btree); + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); + mutex_destroy(&rr->lock); +} + +/* Repair the realtime rmap btree. */ +int +xrep_rtrmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rtrmap *rr = sc->buf; + int error; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + error = xrep_rtrmap_setup_scan(rr); + if (error) + return error; + + /* Collect rmaps for realtime files. */ + error = xrep_rtrmap_find_rmaps(rr); + if (error) + goto out_records; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the rtrmap information. */ + error = xrep_rtrmap_build_new_tree(rr); + if (error) + goto out_records; + + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); + if (error) + goto out_records; + +out_records: + xrep_rtrmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 5055092bd9e8..4ac679c1bd29 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -17,10 +17,15 @@ #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_sb.h" +#include "xfs_exchmaps.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/xfile.h" +#include "scrub/repair.h" +#include "scrub/tempexch.h" +#include "scrub/rtsummary.h" /* * Realtime Summary @@ -32,18 +37,6 @@ * (potentially large) amount of data in pageable memory. */ -struct xchk_rtsummary { - struct xfs_rtalloc_args args; - - uint64_t rextents; - uint64_t rbmblocks; - uint64_t rsumsize; - unsigned int rsumlevels; - - /* Memory buffer for the summary comparison. */ - union xfs_suminfo_raw words[]; -}; - /* Set us up to check the rtsummary file. 
*/ int xchk_setup_rtsummary( @@ -54,27 +47,41 @@ xchk_setup_rtsummary( struct xchk_rtsummary *rts; int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), XCHK_GFP_FLAGS); if (!rts) return -ENOMEM; sc->buf = rts; + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtsummary(sc, rts); + if (error) + return error; + } + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. */ descr = xchk_xfile_descr(sc, "realtime summary file"); - error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + error = xfile_create(descr, XFS_FSB_TO_B(mp, mp->m_rsumblocks), + &sc->xfile); kfree(descr); if (error) return error; - error = xchk_trans_alloc(sc, 0); + error = xchk_trans_alloc(sc, rts->resblks); if (error) return error; - error = xchk_install_live_inode(sc, mp->m_rsumip); + error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg)); if (error) return error; @@ -82,32 +89,27 @@ xchk_setup_rtsummary( if (error) return error; - /* - * Locking order requires us to take the rtbitmap first. We must be - * careful to unlock it ourselves when we are done with the rtbitmap - * file since the scrub infrastructure won't do that for us. Only - * then we can lock the rtsummary inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP); + if (error) + return error; /* * Now that we've locked the rtbitmap and rtsummary, we can't race with * growfsrt trying to expand the summary or change the size of the rt * volume. Hence it is safe to compute and check the geometry values. + * + * Note that there is no strict requirement for an exclusive lock on the + * summary here, but to keep the locking APIs simple we lock both inodes + * exclusively here. If we ever start caring about running concurrent + * fsmap with scrub this could be changed. */ if (mp->m_sb.sb_rblocks) { - xfs_filblks_t rsumblocks; - int rextslog; - - rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); - rextslog = xfs_compute_rextslog(rts->rextents); - rts->rsumlevels = rextslog + 1; - rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); - rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, - rts->rbmblocks); - rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); + rts->rbmblocks = xfs_rtbitmap_blockcount(mp); + rts->rsumblocks = + xfs_rtsummary_blockcount(mp, &rts->rsumlevels); } + return 0; } @@ -135,7 +137,7 @@ xfsum_store( sumoff << XFS_WORDLOG); } -static inline int +inline int xfsum_copyout( struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, @@ -151,6 +153,11 @@ xchk_rtsum_inc( struct xfs_mount *mp, union xfs_suminfo_raw *v) { + if (xfs_has_rtgroups(mp)) { + be32_add_cpu(&v->rtg, 1); + return be32_to_cpu(v->rtg); + } + v->old += 1; return v->old; } @@ -158,11 +165,12 @@ xchk_rtsum_inc( /* Update the summary file to reflect the free extent that we've accumulated. 
*/ STATIC int xchk_rtsum_record_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_scrub *sc = priv; xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; @@ -181,11 +189,11 @@ xchk_rtsum_record_free( lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + rtbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); + rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount); if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); return -EFSCORRUPTED; } @@ -207,15 +215,14 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_blocks; + struct xfs_rtgroup *rtg = sc->sr.rtg; /* If the bitmap size doesn't match the computed size, bail. */ - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); - if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) + if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) != + rtg_bitmap(rtg)->i_disk_size) return -EFSCORRUPTED; - return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, - sc); + return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc); } /* Compare the rtsummary file against the one we computed. */ @@ -234,8 +241,9 @@ xchk_rtsum_compare( xfs_rtsumoff_t sumoff = 0; int error = 0; - rts->args.mp = sc->mp; + rts->args.mp = mp; rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; /* Mappings may not cross or lie beyond EOF. */ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); @@ -302,31 +310,34 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg_bitmap(rtg); + struct xfs_inode *rsumip = rtg_summary(rtg); struct xchk_rtsummary *rts = sc->buf; - int error = 0; + int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rts->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rbmip->i_ino); + return 0; } /* Is m_rsumlevels correct? */ if (mp->m_rsumlevels != rts->rsumlevels) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Is m_rsumsize correct? */ - if (mp->m_rsumsize != rts->rsumsize) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (mp->m_rsumblocks != rts->rsumblocks) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* The summary file length must be aligned to an fsblock. */ - if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* @@ -334,15 +345,15 @@ xchk_rtsummary( * growfsrt expands the summary file before updating sb_rextents, so * the file can be larger than rsumsize. */ - if (mp->m_rsumip->i_disk_size < rts->rsumsize) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Invoke the fork scrubber. 
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out_rbm; + return error; /* Construct the new summary file from the rtbitmap. */ error = xchk_rtsum_compute(sc); @@ -351,18 +362,12 @@ xchk_rtsummary( * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref * error since we're checking the summary file. */ - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); - error = 0; - goto out_rbm; + xchk_ino_set_corrupt(sc, rbmip->i_ino); + return 0; } if (error) - goto out_rbm; + return error; /* Does the computed summary file match the actual rtsummary file? */ - error = xchk_rtsum_compare(sc); - -out_rbm: - /* Unlock the rtbitmap since we're done with it. */ - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - return error; + return xchk_rtsum_compare(sc); } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h new file mode 100644 index 000000000000..e44b04cb6e2d --- /dev/null +++ b/fs/xfs/scrub/rtsummary.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTSUMMARY_H__ +#define __XFS_SCRUB_RTSUMMARY_H__ + +struct xchk_rtsummary { +#ifdef CONFIG_XFS_ONLINE_REPAIR + struct xrep_tempexch tempexch; +#endif + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + xfs_filblks_t rsumblocks; + unsigned int rsumlevels; + unsigned int resblks; + + /* suminfo position of xfile as we write buffers to disk. */ + xfs_rtsumoff_t prep_wordoff; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + +int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words); + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts); +#else +# define xrep_setup_rtsummary(sc, rts) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTSUMMARY_H__ */ diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c new file mode 100644 index 000000000000..d593977d70df --- /dev/null +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/rtsummary.h" + +/* Set us up to repair the rtsummary file. 
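+ * That means creating a temporary file now, and reserving enough blocks + * to write out a whole new summary file and exchange its mappings into + * place later; see below for why we must estimate this up front.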
*/ +int +xrep_setup_rtsummary( + struct xfs_scrub *sc, + struct xchk_rtsummary *rts) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new summary file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement rtsummary and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtsummary ILOCK) and cannot ask for more reservation. + */ + blocks = mp->m_rsumblocks; + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rts->resblks += blocks; + return 0; +} + +static int +xrep_rtsummary_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtsummary *rts = data; + struct xfs_mount *mp = sc->mp; + union xfs_suminfo_raw *ondisk; + int error; + + rts->args.mp = mp; + rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; + rts->args.sumbp = bp; + ondisk = xfs_rsumblock_infoptr(&rts->args, 0); + rts->args.sumbp = NULL; + + error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); + if (error) + return error; + + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtsummary_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + + rts->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); + return 0; +} + +/* Repair the realtime summary. */ +int +xrep_rtsummary( + struct xfs_scrub *sc) +{ + struct xchk_rtsummary *rts = sc->buf; + struct xfs_mount *mp = sc->mp; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(mp)) + return -EOPNOTSUPP; + + /* Walk away if we disagree on the size of the rt bitmap. */ + if (rts->rbmblocks != mp->m_sb.sb_rbmblocks) + return 0; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtsummary file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* Make sure we have space allocated for the entire summary file. */ + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rts->rsumblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the rtsummary file that we generated. 
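+ * Each block goes through xrep_rtsummary_prep_buf above, which copies + * one block's worth of summary words out of the xfile and, on rtgroups + * filesystems, stamps in the block header before the buffer is written.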
*/ + error = xrep_tempfile_copyin(sc, 0, rts->rsumblocks, + xrep_rtsummary_prep_buf, rts); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, XFS_FSB_TO_B(mp, rts->rsumblocks)); + if (error) + return error; + + /* + * Now exchange the contents. Nothing in repair uses the temporary + * buffer, so we can reuse it for the tempfile exchrange information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, + rts->rsumblocks, &rts->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rts->tempexch); + if (error) + return error; + + /* Reset incore state and blow out the summary cache. */ + if (sc->sr.rtg->rtg_rsum_cache) + memset(sc->sr.rtg->rtg_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + + mp->m_rsumlevels = rts->rsumlevels; + mp->m_rsumblocks = rts->rsumblocks; + + /* Free the old rtsummary blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 20fac9723c08..76e24032e99a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -17,6 +17,11 @@ #include "xfs_scrub.h" #include "xfs_buf_mem.h" #include "xfs_rmap.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -24,6 +29,8 @@ #include "scrub/health.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/orphanage.h" /* * Online Scrub and Repair @@ -142,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } @@ -157,7 +176,7 @@ xchk_fsgates_disable( trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); if (sc->flags & XCHK_FSGATES_DRAIN) - xfs_drain_wait_disable(); + xfs_defer_drain_wait_disable(); if (sc->flags & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_disable(); @@ -171,6 +190,39 @@ xchk_fsgates_disable( sc->flags &= ~XCHK_FSGATES_ALL; } +/* Free the resources associated with a scrub subtype. */ +void +xchk_scrub_free_subord( + struct xfs_scrub_subord *sub) +{ + struct xfs_scrub *sc = sub->parent_sc; + + ASSERT(sc->ip == sub->sc.ip); + ASSERT(sc->orphanage == sub->sc.orphanage); + ASSERT(sc->tempip == sub->sc.tempip); + + sc->sm->sm_type = sub->old_smtype; + sc->sm->sm_flags = sub->old_smflags | + (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT); + sc->tp = sub->sc.tp; + + if (sub->sc.buf) { + if (sub->sc.buf_cleanup) + sub->sc.buf_cleanup(sub->sc.buf); + kvfree(sub->sc.buf); + } + if (sub->sc.xmbtp) + xmbuf_free(sub->sc.xmbtp); + if (sub->sc.xfile) + xfile_destroy(sub->sc.xfile); + + sc->ilock_flags = sub->sc.ilock_flags; + sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags; + sc->temp_ilock_flags = sub->sc.temp_ilock_flags; + + kfree(sub); +} + /* Free all the resources and finish the transactions. 
*/ STATIC int xchk_teardown( @@ -178,6 +230,8 @@ xchk_teardown( int error) { xchk_ag_free(sc, &sc->sa); + xchk_rtgroup_btcur_free(&sc->sr); + if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) error = xfs_trans_commit(sc->tp); @@ -185,6 +239,8 @@ xchk_teardown( xfs_trans_cancel(sc->tp); sc->tp = NULL; } + if (sc->sr.rtg) + xchk_rtgroup_free(sc, &sc->sr); if (sc->ip) { if (sc->ilock_flags) xchk_iunlock(sc, sc->ilock_flags); @@ -211,6 +267,8 @@ xchk_teardown( sc->buf = NULL; } + xrep_tempfile_rele(sc); + xrep_orphanage_rele(sc); xchk_fsgates_disable(sc); return error; } @@ -319,37 +377,39 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_directory, .scrub = xchk_directory, - .repair = xrep_notsupported, + .repair = xrep_directory, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xchk_setup_xattr, .scrub = xchk_xattr, - .repair = xrep_notsupported, + .repair = xrep_xattr, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xchk_setup_symlink, .scrub = xchk_symlink, - .repair = xrep_notsupported, + .repair = xrep_symlink, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xchk_setup_parent, .scrub = xchk_parent, - .repair = xrep_notsupported, + .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ - .type = ST_FS, + .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ - .type = ST_FS, + .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .repair = xrep_notsupported, + .repair = xrep_rtsummary, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, @@ -393,6 +453,41 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_health_record, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */ + .type = ST_INODE, + .setup = xchk_setup_dirtree, + .scrub = xchk_dirtree, + .has = xfs_has_parent, + .repair = xrep_dirtree, + }, + [XFS_SCRUB_TYPE_METAPATH] = { /* metadata directory tree path */ + .type = ST_GENERIC, + .setup = xchk_setup_metapath, + .scrub = xchk_metapath, + .has = xfs_has_metadir, + .repair = xrep_metapath, + }, + [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */ + .type = ST_RTGROUP, + .setup = xchk_setup_rgsuperblock, + .scrub = xchk_rgsuperblock, + .has = xfs_has_rtsb, + .repair = xrep_rgsuperblock, + }, + [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrmapbt, + .scrub = xchk_rtrmapbt, + .has = xfs_has_rtrmapbt, + .repair = xrep_rtrmapbt, + }, + [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */ + .type = ST_RTGROUP, + .setup = xchk_setup_rtrefcountbt, + .scrub = xchk_rtrefcountbt, + .has = xfs_has_rtreflink, + .repair = xrep_rtrefcountbt, + }, }; static int @@ -440,6 +535,35 @@ xchk_validate_inputs( if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) goto out; break; + case ST_GENERIC: + break; + case ST_RTGROUP: + if (sm->sm_ino || sm->sm_gen) + goto out; + if (xfs_has_rtgroups(mp)) { + /* + * On a rtgroups filesystem, there won't be an rtbitmap + * or rtsummary file for group 0 unless there's + * actually a realtime volume attached. However, older + * xfs_scrub always calls the rtbitmap/rtsummary + * scrubbers with sm_agno==0 so transform the error + * code to ENOENT. 
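+					 * (With rtgroups enabled but no + * realtime volume attached, sb_rgcount + * is zero, so a group 0 query lands + * here and reports ENOENT.)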
+ */ + if (sm->sm_agno >= mp->m_sb.sb_rgcount) { + if (sm->sm_agno == 0) + error = -ENOENT; + goto out; + } + } else { + /* + * Prior to rtgroups, the rtbitmap/rtsummary scrubbers + * accepted sm_agno==0, so we still accept that for + * scrubbing pre-rtgroups filesystems. + */ + if (sm->sm_agno != 0) + goto out; + } + break; default: goto out; } @@ -497,8 +621,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) } #endif /* CONFIG_XFS_ONLINE_REPAIR */ +/* + * Create a new scrub context from an existing one, but with a different scrub + * type. + */ +struct xfs_scrub_subord * +xchk_scrub_create_subord( + struct xfs_scrub *sc, + unsigned int subtype) +{ + struct xfs_scrub_subord *sub; + + sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS); + if (!sub) + return ERR_PTR(-ENOMEM); + + sub->old_smtype = sc->sm->sm_type; + sub->old_smflags = sc->sm->sm_flags; + sub->parent_sc = sc; + memcpy(&sub->sc, sc, sizeof(struct xfs_scrub)); + sub->sc.ops = &meta_scrub_ops[subtype]; + sub->sc.sm->sm_type = subtype; + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sub->sc.buf = NULL; + sub->sc.buf_cleanup = NULL; + sub->sc.xfile = NULL; + sub->sc.xmbtp = NULL; + + return sub; +} + /* Dispatch metadata scrubbing. */ -int +STATIC int xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) @@ -526,9 +680,6 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, - "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); - sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; @@ -540,6 +691,7 @@ xfs_scrub_metadata( sc->sm = sm; sc->ops = &meta_scrub_ops[sm->sm_type]; sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); + sc->relax = INIT_XCHK_RELAX; retry_op: /* * When repairs are allowed, prevent freezing or readonly remount while @@ -643,3 +795,221 @@ try_harder: run.retries++; goto retry_op; } + +/* Scrub one aspect of one piece of metadata. */ +int +xfs_ioc_scrub_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(file, &scrub); + if (error) + return error; + + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + +/* Decide if there have been any scrub failures up to this point. */ +static inline int +xfs_scrubv_check_barrier( + struct xfs_mount *mp, + const struct xfs_scrub_vec *vectors, + const struct xfs_scrub_vec *stop_vec) +{ + const struct xfs_scrub_vec *v; + __u32 failmask; + + failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT; + + for (v = vectors; v < stop_vec; v++) { + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) + continue; + + /* + * Runtime errors count as a previous failure, except the ones + * used to ask userspace to retry. + */ + switch (v->sv_ret) { + case -EBUSY: + case -ENOENT: + case -EUSERS: + case 0: + break; + default: + return -ECANCELED; + } + + /* + * If any of the out-flags on the scrub vector match the mask + * that was set on the barrier vector, that's a previous fail. + */ + if (v->sv_flags & failmask) + return -ECANCELED; + } + + return 0; +} + +/* + * If the caller provided us with a nonzero inode number that isn't the ioctl + * file, try to grab a reference to it to eliminate all further untrusted inode + * lookups. If we can't get the inode, let each scrub function try again. 
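Failing to get the inode here is therefore not fatal; the reference is purely an optimization that keeps the inode incore for the duration of the vectored call.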
+ */ +STATIC struct xfs_inode * +xchk_scrubv_open_by_handle( + struct xfs_mount *mp, + const struct xfs_scrub_vec_head *head) +{ + struct xfs_trans *tp; + struct xfs_inode *ip; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return NULL; + + error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); + xfs_trans_cancel(tp); + if (error) + return NULL; + + if (VFS_I(ip)->i_generation != head->svh_gen) { + xfs_irele(ip); + return NULL; + } + + return ip; +} + +/* Vectored scrub implementation to reduce ioctl calls. */ +int +xfs_ioc_scrubv_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_vec_head head; + struct xfs_scrub_vec_head __user *uhead = arg; + struct xfs_scrub_vec *vectors; + struct xfs_scrub_vec __user *uvectors; + struct xfs_inode *ip_in = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip_in->i_mount; + struct xfs_inode *handle_ip = NULL; + struct xfs_scrub_vec *v; + size_t vec_bytes; + unsigned int i; + int error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&head, uhead, sizeof(head))) + return -EFAULT; + + if (head.svh_reserved) + return -EINVAL; + if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL) + return -EINVAL; + if (head.svh_nr == 0) + return 0; + + vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec)); + if (vec_bytes > PAGE_SIZE) + return -ENOMEM; + + uvectors = u64_to_user_ptr(head.svh_vectors); + vectors = memdup_user(uvectors, vec_bytes); + if (IS_ERR(vectors)) + return PTR_ERR(vectors); + + trace_xchk_scrubv_start(ip_in, &head); + + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + if (v->sv_reserved) { + error = -EINVAL; + goto out_free; + } + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER && + (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) { + error = -EINVAL; + goto out_free; + } + + trace_xchk_scrubv_item(mp, &head, i, v); + } + + /* + * If the caller wants us to do a scrub-by-handle and the file used to + * call the ioctl is not the same file, load the incore inode and pin + * it across all the scrubv actions to avoid repeated UNTRUSTED + * lookups. The reference is not passed to deeper layers of scrub + * because each scrubber gets to decide its own strategy and return + * values for getting an inode. + */ + if (head.svh_ino && head.svh_ino != ip_in->i_ino) + handle_ip = xchk_scrubv_open_by_handle(mp, &head); + + /* Run all the scrubbers. 
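Each vector is translated into an ordinary xfs_scrub_metadata call; barrier vectors instead stop the loop if any previous vector tripped the failure mask, and svh_rest_us adds an optional pause between calls.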
*/ + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + struct xfs_scrub_metadata sm = { + .sm_type = v->sv_type, + .sm_flags = v->sv_flags, + .sm_ino = head.svh_ino, + .sm_gen = head.svh_gen, + .sm_agno = head.svh_agno, + }; + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) { + v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v); + if (v->sv_ret) { + trace_xchk_scrubv_barrier_fail(mp, &head, i, v); + break; + } + + continue; + } + + v->sv_ret = xfs_scrub_metadata(file, &sm); + v->sv_flags = sm.sm_flags; + + trace_xchk_scrubv_outcome(mp, &head, i, v); + + if (head.svh_rest_us) { + ktime_t expires; + + expires = ktime_add_ns(ktime_get(), + head.svh_rest_us * 1000); + set_current_state(TASK_KILLABLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out_free; + } + } + + if (copy_to_user(uvectors, vectors, vec_bytes) || + copy_to_user(uhead, &head, sizeof(head))) { + error = -EFAULT; + goto out_free; + } + +out_free: + if (handle_ip) + xfs_irele(handle_ip); + kfree(vectors); + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 9ad65b604fe1..a3f1abc91390 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,49 @@ struct xfs_scrub; +struct xchk_relax { + unsigned long next_resched; + unsigned int resched_nr; + bool interruptible; +}; + +/* Yield to the scheduler at most 10x per second. */ +#define XCHK_RELAX_NEXT (jiffies + (HZ / 10)) + +#define INIT_XCHK_RELAX \ + (struct xchk_relax){ \ + .next_resched = XCHK_RELAX_NEXT, \ + .resched_nr = 0, \ + .interruptible = true, \ + } + +/* + * Relax during a scrub operation and exit if there's a fatal signal pending. + * + * If preemption is disabled, we need to yield to the scheduler every now and + * then so that we don't run afoul of the soft lockup watchdog or RCU stall + * detector. cond_resched calls are somewhat expensive (~5ns) so we want to + * ratelimit this to 10x per second. Amortize the cost of the other checks by + * only doing it once every 100 calls. + */ +static inline int xchk_maybe_relax(struct xchk_relax *widget) +{ + /* Amortize the cost of scheduling and checking signals. */ + if (likely(++widget->resched_nr < 100)) + return 0; + widget->resched_nr = 0; + + if (unlikely(widget->next_resched <= jiffies)) { + cond_resched(); + widget->next_resched = XCHK_RELAX_NEXT; + } + + if (widget->interruptible && fatal_signal_pending(current)) + return -EINTR; + + return 0; +} + /* * Standard flags for allocating memory within scrub. NOFS context is * configured by the process allocation scope. Scrub and repair must be able @@ -17,12 +60,21 @@ struct xfs_scrub; #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ __GFP_RETRY_MAYFAIL)) +/* + * For opening files by handle for fsck operations, we don't trust the inumber + * or the allocation state; therefore, perform an untrusted lookup. We don't + * want these inodes to pollute the cache, so mark them for immediate removal. + */ +#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ ST_PERAG, /* per-AG metadata */ ST_FS, /* per-FS metadata */ ST_INODE, /* per-inode metadata */ + ST_GENERIC, /* determined by the scrubber */ + ST_RTGROUP, /* rtgroup metadata */ }; struct xchk_meta_ops { @@ -44,7 +96,7 @@ struct xchk_meta_ops { int (*repair_eval)(struct xfs_scrub *sc); /* Decide if we even have this piece of metadata. 
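Scrubbers for optional features (e.g. the rtrmapbt scrubber's xfs_has_rtrmapbt predicate) are skipped with ENOENT when this returns false.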
*/ - bool (*has)(struct xfs_mount *); + bool (*has)(const struct xfs_mount *); /* type describing required/allowed inputs */ enum xchk_type type; @@ -67,6 +119,19 @@ struct xchk_ag { struct xfs_btree_cur *refc_cur; }; +/* Inode lock state for the RT volume. */ +struct xchk_rt { + /* incore rtgroup, if applicable */ + struct xfs_rtgroup *rtg; + + /* XFS_RTGLOCK_* lock state if locked */ + unsigned int rtlock_flags; + + /* rtgroup btrees */ + struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; +}; + struct xfs_scrub { /* General scrub state. */ struct xfs_mount *mp; @@ -105,6 +170,14 @@ struct xfs_scrub { /* Lock flags for @ip. */ uint ilock_flags; + /* The orphanage, for stashing files that have lost their parent. */ + uint orphanage_ilock_flags; + struct xfs_inode *orphanage; + + /* A temporary file on this filesystem, for staging new metadata. */ + struct xfs_inode *tempip; + uint temp_ilock_flags; + /* See the XCHK/XREP state flags below. */ unsigned int flags; @@ -115,8 +188,20 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* State tracking for single-AG operations. */ struct xchk_ag sa; + + /* State tracking for realtime operations. */ + struct xchk_rt sr; }; /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ @@ -141,6 +226,40 @@ struct xfs_scrub { XCHK_FSGATES_DIRENTS | \ XCHK_FSGATES_RMAP) +struct xfs_scrub_subord { + struct xfs_scrub sc; + struct xfs_scrub *parent_sc; + unsigned int old_smtype; + unsigned int old_smflags; +}; + +struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc, + unsigned int subtype); +void xchk_scrub_free_subord(struct xfs_scrub_subord *sub); + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. 
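The error code is only overwritten with -EINTR if it is still zero, so an earlier failure is never clobbered.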
+ */ +static inline bool +xchk_should_terminate( + struct xfs_scrub *sc, + int *error) +{ + if (xchk_maybe_relax(&sc->relax)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +static inline int xchk_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -159,35 +278,27 @@ int xchk_directory(struct xfs_scrub *sc); int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); +int xchk_dirtree(struct xfs_scrub *sc); +int xchk_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); +int xchk_rgsuperblock(struct xfs_scrub *sc); +int xchk_rtrmapbt(struct xfs_scrub *sc); +int xchk_rtrefcountbt(struct xfs_scrub *sc); #else -static inline int -xchk_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_rtbitmap xchk_nothing +# define xchk_rtsummary xchk_nothing +# define xchk_rgsuperblock xchk_nothing +# define xchk_rtrmapbt xchk_nothing +# define xchk_rtrefcountbt xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); int xchk_quotacheck(struct xfs_scrub *sc); #else -static inline int -xchk_quota(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_quotacheck(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_quota xchk_nothing +# define xchk_quotacheck xchk_nothing #endif int xchk_fscounters(struct xfs_scrub *sc); int xchk_nlinks(struct xfs_scrub *sc); @@ -214,8 +325,26 @@ void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); +void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo); +void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); +void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, + xfs_extlen_t len); #else # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) +# define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0) +# define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0) +# define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0) #endif #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 42cafbed94ac..f8a37ea97791 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -79,6 +79,11 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", [XFS_SCRUB_TYPE_NLINKS] = "nlinks", + [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", + [XFS_SCRUB_TYPE_METAPATH] = "metapath", + [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", + 
[XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", + [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index d77d8a9598f6..c848bcc07cd5 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_symlink.h" #include "xfs_health.h" @@ -17,18 +18,28 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/health.h" +#include "scrub/repair.h" /* Set us up to scrub a symbolic link. */ int xchk_setup_symlink( struct xfs_scrub *sc) { + unsigned int resblks = 0; + int error; + /* Allocate the buffer without the inode lock held. */ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; - return xchk_setup_inode_contents(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_symlink(sc, &resblks); + if (error) + return error; + } + + return xchk_setup_inode_contents(sc, resblks); } /* Symbolic links. */ diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c new file mode 100644 index 000000000000..953ce7be78dc --- /dev/null +++ b/fs/xfs/scrub/symlink_repair.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_symlink_remote.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/health.h" + +/* + * Symbolic Link Repair + * ==================== + * + * We repair symbolic links by reading whatever target data we can find, up to + * the first NULL byte. If the recovered target strlen matches i_size, then + * we rewrite the target. In all other cases, we replace the target with an + * overly long string that cannot possibly resolve. The new target is written + * into a private hidden temporary file, and then a file contents exchange + * commits the new symlink target to the file being repaired. + */ + +/* Set us up to repair the symlink file. */ +int +xrep_setup_symlink( + struct xfs_scrub *sc, + unsigned int *resblks) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFLNK); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new symlink file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. 
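(With 4096-byte blocks, for example, even an XFS_SYMLINK_MAXLEN-byte target fits in a single block, so this reservation stays small.)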
+ * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement symlink and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * symlink ILOCK) and cannot ask for more reservation. + */ + blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + *resblks += blocks; + return 0; +} + +/* + * Try to salvage the pathname from remote blocks. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_remote( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_inode *ip = sc->ip; + struct xfs_buf *bp; + char *target_buf = sc->buf; + xfs_failaddr_t fa; + xfs_filblks_t fsblocks; + xfs_daddr_t d; + loff_t len; + loff_t offset = 0; + unsigned int byte_cnt; + bool magic_ok; + bool hdr_ok; + int n; + int nmaps = XFS_SYMLINK_MAPS; + int error; + + /* We'll only read until the buffer is full. */ + len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN); + fsblocks = xfs_symlink_blocks(sc->mp, len); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + return error; + + for (n = 0; n < nmaps; n++) { + struct xfs_dsymlink_hdr *dsl; + + d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock); + + /* Read the rmt block. We'll run the verifiers manually. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount), + 0, &bp, NULL); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + /* How many bytes do we expect to get out of this buffer? */ + byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount); + byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt); + byte_cnt = min_t(unsigned int, byte_cnt, len); + + /* + * See if the verifiers accept this block. We're willing to + * salvage if the offset/byte/ino are ok and either the + * verifier passed or the magic is ok. Anything else and we + * stop dead in our tracks. + */ + fa = bp->b_ops->verify_struct(bp); + dsl = bp->b_addr; + magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC); + hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp); + if (!hdr_ok || (fa != NULL && !magic_ok)) + break; + + memcpy(target_buf + offset, dsl + 1, byte_cnt); + + len -= byte_cnt; + offset += byte_cnt; + } + return offset; +} + +/* + * Try to salvage an inline symlink's contents. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_inline( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + char *target_buf = sc->buf; + char *old_target; + struct xfs_ifork *ifp; + unsigned int nr; + + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + if (!ifp->if_data) + return 0; + + /* + * If inode repair zapped the link target, pretend that we didn't find + * any bytes at all so that we can replace the (now totally lost) link + * target with a warning message. + */ + old_target = ifp->if_data; + if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) && + sc->ip->i_disk_size == 1 && old_target[0] == '?') + return 0; + + nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + strncpy(target_buf, ifp->if_data, nr); + return nr; +} + +#define DUMMY_TARGET \ + "The target of this symbolic link could not be recovered at all and " \ + "has been replaced with this explanatory message.
To avoid " \ + "accidentally pointing to an existing file path, this message is " \ + "longer than the maximum supported file name length. That is an " \ + "acceptable length for a symlink target on XFS but will produce " \ + "File Name Too Long errors if resolved." + +/* Salvage whatever we can of the target. */ +STATIC int +xrep_symlink_salvage( + struct xfs_scrub *sc) +{ + char *target_buf = sc->buf; + ssize_t buflen = 0; + + BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX); + + /* + * Salvage the target if there weren't any corruption problems observed + * while scanning it. + */ + if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) + buflen = xrep_symlink_salvage_inline(sc); + else + buflen = xrep_symlink_salvage_remote(sc); + if (buflen < 0) + return buflen; + + /* + * NULL-terminate the buffer because the ondisk target does not + * do that for us. If salvage didn't find the exact amount of + * data that we expected to find, don't salvage anything. + */ + target_buf[buflen] = 0; + if (strlen(target_buf) != sc->ip->i_disk_size) + buflen = 0; + } + + /* + * Change an empty target into a dummy target and clear the symlink + * target zapped flag. + */ + if (buflen == 0) { + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + sprintf(target_buf, DUMMY_TARGET); + } + + trace_xrep_symlink_salvage_target(sc->ip, target_buf, + strlen(target_buf)); + return 0; +} + +STATIC void +xrep_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL); + + if (!xfs_has_crc(sc->mp)) + return; + + dsl->sl_owner = cpu_to_be64(sc->ip->i_ino); + xfs_trans_log_buf(tp, bp, 0, + sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); +} + +/* + * Prepare both links' data forks for an exchange. Promote the tempfile from + * local format to extents format, and if the file being repaired has a short + * format data fork, turn it into an empty extent list. + */ +STATIC int +xrep_symlink_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the temp link is in shortform format, convert that to a remote + * target so that we can use the atomic mapping exchange. + */ + if (temp_local) { + int logflags = XFS_ILOG_CORE; + + error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1, + &logflags, XFS_DATA_FORK, + xrep_symlink_local_to_remote, + sc); + if (error) + return error; + + xfs_trans_log_inode(sc->tp, sc->ip, 0); + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* Exchange the temporary symlink's data fork with the one being repaired. 
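Take the shortform copy path when both targets are local and the new one fits; otherwise both data forks are converted to extents format and exchanged atomically.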
*/ +STATIC int +xrep_symlink_swap( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx = sc->buf; + bool ip_local, temp_local; + int error; + + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If both links have a local format data fork and the rebuilt + * remote data would fit in the repaired file's data fork, copy the + * contents from the tempfile and declare ourselves done. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return 0; + } + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_symlink_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + return xrep_tempexch_contents(sc, tx); +} + +/* + * Free all the remote blocks and reset the data fork. The caller must join + * the inode to the transaction. This function returns with the inode joined + * to a clean scrub transaction. + */ +STATIC int +xrep_symlink_reset_fork( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the remote target buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_symlink_reset_fork(sc->tempip); + + /* Reset the temp symlink target to dummy content. */ + xfs_idestroy_fork(ifp); + return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino, + "?", 1, 0, 0); +} + +/* + * Reinitialize a link target. Caller must ensure the inode is joined to + * the transaction. + */ +STATIC int +xrep_symlink_rebuild( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx; + char *target_buf = sc->buf; + xfs_fsblock_t fs_blocks; + unsigned int target_len; + unsigned int resblks; + int error; + + /* How many blocks do we need? */ + target_len = strlen(target_buf); + ASSERT(target_len != 0); + if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN) + return -EFSCORRUPTED; + + trace_xrep_symlink_rebuild(sc->ip); + + /* + * In preparation to write the new symlink target to the temporary + * file, drop the ILOCK of the file being repaired (it shouldn't be + * joined) and take the ILOCK of the temporary file. + * + * The VFS does not take the IOLOCK while reading a symlink (and new + * symlinks are hidden with INEW until they've been written) so it's + * possible that a readlink() could see the old corrupted contents + * while we're doing this. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Reserve resources to reinitialize the target. We're allowed to + * exceed file quota to repair inconsistent metadata, though this is + * unlikely. + */ + fs_blocks = xfs_symlink_blocks(sc->mp, target_len); + resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks); + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0, + true); + if (error) + return error; + + /* Erase the dummy target set up by the tempfile initialization. */ + xfs_idestroy_fork(&sc->tempip->i_df); + sc->tempip->i_df.if_bytes = 0; + sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* Write the salvaged target to the temporary link.
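Note that the owner passed here is the inode being repaired, not the tempfile, so that any remote block headers will verify correctly once the forks have been exchanged.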
*/ + error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino, + target_buf, target_len, fs_blocks, resblks); + if (error) + return error; + + /* + * Commit the repair transaction so that we can use the atomic mapping + * exchange functions to compute the correct block reservations and + * re-lock the inodes. + */ + target_buf = NULL; + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + xrep_tempfile_iunlock(sc); + + /* + * We're done with the temporary buffer, so we can reuse it for the + * tempfile contents exchange information. + */ + tx = sc->buf; + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx); + if (error) + return error; + + /* + * Exchange the temp link's data fork with the file being repaired. + * This recreates the transaction and takes the ILOCKs of the file + * being repaired and the temporary file. + */ + error = xrep_symlink_swap(sc); + if (error) + return error; + + /* + * Release the old symlink blocks and reset the data fork of the temp + * link to an empty shortform link. This is the last repair action we + * perform on the symlink, so we don't need to clean the transaction. + */ + return xrep_symlink_reset_fork(sc); +} + +/* Repair a symbolic link. */ +int +xrep_symlink( + struct xfs_scrub *sc) +{ + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_symlink_salvage(sc); + if (error) + return error; + + /* Now reset the target. */ + error = xrep_symlink_rebuild(sc); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h new file mode 100644 index 000000000000..eccda720c2ca --- /dev/null +++ b/fs/xfs/scrub/tempexch.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPEXCH_H__ +#define __XFS_SCRUB_TEMPEXCH_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +struct xrep_tempexch { + struct xfs_exchmaps_req req; +}; + +int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, + xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti); +int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); + +int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti); +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPEXCH_H__ */ diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c new file mode 100644 index 000000000000..cf99e0ca51b0 --- /dev/null +++ b/fs/xfs/scrub/tempfile.c @@ -0,0 +1,980 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_defer.h" +#include "xfs_symlink_remote.h" +#include "xfs_metafile.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" + +/* + * Create a temporary file for reconstructing metadata, with the intention of + * atomically exchanging the temporary file's contents with the file that's + * being repaired. + */ +int +xrep_tempfile_create( + struct xfs_scrub *sc, + uint16_t mode) +{ + struct xfs_icreate_args args = { + .pip = sc->mp->m_rootip, + .mode = mode, + .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE, + }; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = NULL; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; + struct xfs_trans_res *tres; + struct xfs_inode *dp = mp->m_rootip; + xfs_ino_t ino; + unsigned int resblks; + bool is_dir = S_ISDIR(mode); + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) + return -EROFS; + + ASSERT(sc->tp == NULL); + ASSERT(sc->tempip == NULL); + + /* + * Make sure that we have allocated dquot(s) on disk. The temporary + * inode should be completely root owned so that we don't fail due to + * quota limits. + */ + error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + resblks = xfs_mkdir_space_res(mp, 0); + tres = &M_RES(mp)->tr_mkdir; + } else { + resblks = XFS_IALLOC_SPACE_RES(mp); + tres = &M_RES(mp)->tr_create_tmpfile; + } + + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); + if (error) + goto out_release_dquots; + + /* Allocate inode, set up directory. */ + error = xfs_dialloc(&tp, &args, &ino); + if (error) + goto out_trans_cancel; + error = xfs_icreate(tp, ino, &args, &sc->tempip); + if (error) + goto out_trans_cancel; + + /* We don't touch file data, so drop the realtime flags. */ + sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); + + /* + * Mark our temporary file as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files. + * The file should never be exposed to userspace. + */ + VFS_I(sc->tempip)->i_flags |= S_PRIVATE; + VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; + + if (is_dir) { + error = xfs_dir_init(tp, sc->tempip, dp); + if (error) + goto out_trans_cancel; + } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { + /* + * Initialize the temporary symlink with a meaningless target + * that won't trip the verifiers. Repair must rewrite the + * target with meaningful content before swapping with the file + * being repaired. A single-byte target will not write a + * remote target block, so the owner is irrelevant. + */ + error = xfs_symlink_write_target(tp, sc->tempip, + sc->tempip->i_ino, ".", 1, 0, 0); + if (error) + goto out_trans_cancel; + } + + /* + * Attach the dquot(s) to the inodes and modify them incore. 
+ * The inode's IDs could not have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); + + /* + * Put our temp file on the unlinked list so it's purged automatically. + * All file-based metadata being reconstructed using this file must be + * atomically exchanged with the original file because the contents + * here will be purged when the inode is dropped or log recovery cleans + * out the unlinked list. + */ + error = xfs_iunlink(tp, sc->tempip); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + trace_xrep_tempfile_create(sc); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + /* Finish setting up the incore / vfs context. */ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_setup_iops(sc->tempip); + xfs_finish_inode_setup(sc->tempip); + + sc->temp_ilock_flags = 0; + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (sc->tempip) { + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(sc->tempip); + xchk_irele(sc, sc->tempip); + } +out_release_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +/* + * Move sc->tempip from the regular directory tree to the metadata directory + * tree if sc->ip is part of the metadata directory tree and tempip has an + * eligible file mode. + * + * Temporary files have to be created before we even know which inode we're + * going to scrub, so we assume that they will be part of the regular directory + * tree. If it turns out that we're actually scrubbing a file from the + * metadata directory tree, we have to subtract the temp file from the root + * dquots and detach the dquots prior to setting the METADATA iflag. However, + * the scrub setup functions grab sc->ip and create sc->tempip before we + * actually get around to checking if the file mode is the right type for the + * scrubber. + */ +int +xrep_tempfile_adjust_directory_tree( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip) + return 0; + + ASSERT(sc->tp == NULL); + ASSERT(!xfs_is_metadir_inode(sc->tempip)); + + if (!sc->ip || !xfs_is_metadir_inode(sc->ip)) + return 0; + if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) && + !S_ISREG(VFS_I(sc->tempip)->i_mode)) + return 0; + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* Metadir files are not accounted in quota, so drop icount */ + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L); + xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN); + + error = xrep_trans_commit(sc); + if (error) + goto out_ilock; + + xfs_iflags_set(sc->tempip, XFS_IRECOVERY); + xfs_qm_dqdetach(sc->tempip); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + +/* + * Remove this temporary file from the metadata directory tree so that it can + * be inactivated the normal way.
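This undoes xrep_tempfile_adjust_directory_tree: clear the METADATA iflag, reattach the dquots, and re-add the inode and block counts to the quota totals.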
+ */ +STATIC int +xrep_tempfile_remove_metadir( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip)) + return 0; + + ASSERT(sc->tp == NULL); + + xfs_iflags_clear(sc->tempip, XFS_IRECOVERY); + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + xfs_metafile_clear_iflag(sc->tp, sc->tempip); + + /* Non-metadir files are accounted in quota, so bump bcount/icount */ + error = xfs_qm_dqattach_locked(sc->tempip, false); + if (error) + goto out_cancel; + + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L); + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT, + sc->tempip->i_nblocks); + error = xrep_trans_commit(sc); + goto out_ilock; + +out_cancel: + xchk_trans_cancel(sc); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + +/* Take IOLOCK_EXCL on the temporary file, maybe. */ +bool +xrep_tempfile_iolock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + return true; + } + + return false; +} + +/* + * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. + * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock + * to avoid deadlocks and lockdep complaints. + */ +int +xrep_tempfile_iolock_polled( + struct xfs_scrub *sc) +{ + int error = 0; + + while (!xrep_tempfile_iolock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + return 0; +} + +/* Release IOLOCK_EXCL on the temporary file. */ +void +xrep_tempfile_iounlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; +} + +/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ +void +xrep_tempfile_ilock( + struct xfs_scrub *sc) +{ + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); +} + +/* Try to grab ILOCK_EXCL on the temporary file. */ +bool +xrep_tempfile_ilock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + return true; + } + + return false; +} + +/* Unlock ILOCK_EXCL on the temporary file after an update. */ +void +xrep_tempfile_iunlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; +} + +/* + * Begin the process of making changes to both the file being scrubbed and + * the temporary file by taking ILOCK_EXCL on both. + */ +void +xrep_tempfile_ilock_both( + struct xfs_scrub *sc) +{ + xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; +} + +/* Unlock ILOCK_EXCL on both files. */ +void +xrep_tempfile_iunlock_both( + struct xfs_scrub *sc) +{ + xrep_tempfile_iunlock(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); +} + +/* Release the temporary file. 
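Because the tempfile sits on the unlinked list, dropping the last reference here sends it through inactivation, which releases any blocks still mapped by the file.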
*/ +void +xrep_tempfile_rele( + struct xfs_scrub *sc) +{ + if (!sc->tempip) + return; + + if (sc->temp_ilock_flags) { + xfs_iunlock(sc->tempip, sc->temp_ilock_flags); + sc->temp_ilock_flags = 0; + } + + xrep_tempfile_remove_metadir(sc); + xchk_irele(sc, sc->tempip); + sc->tempip = NULL; +} + +/* + * Make sure that the given range of the data fork of the temporary file is + * mapped to written blocks. The caller must ensure that both inodes are + * joined to the transaction. + */ +int +xrep_tempfile_prealloc( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t end = off + len; + int error; + + ASSERT(sc->tempip != NULL); + ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); + + for (; off < end; off = map.br_startoff + map.br_blockcount) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + if (xfs_bmap_is_written_extent(&map)) + continue; + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* + * Make sure this block has a real zeroed extent allocated to + * it. + */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, + &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); + + /* Commit new extent and all deferred work. */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + return 0; +} + +/* + * Write data to each block of a file. The given range of the tempfile's data + * fork must already be populated with written extents. + */ +int +xrep_tempfile_copyin( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len, + xrep_tempfile_copyin_fn prep_fn, + void *data) +{ + LIST_HEAD(buffers_list); + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_fileoff_t flush_mask; + xfs_fileoff_t end = off + len; + loff_t pos = XFS_FSB_TO_B(mp, off); + int error = 0; + + ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); + + /* Flush buffers to disk every 512K */ + flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; + + for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { + struct xfs_bmbt_irec map; + int nmaps = 1; + + /* Read block mapping for this file block. */ + error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); + if (error) + goto out_err; + if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { + error = -EFSCORRUPTED; + goto out_err; + } + + /* Get the metadata buffer for this offset in the file. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp); + if (error) + goto out_err; + + trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); + + /* Read in a block's worth of data from the xfile. */ + error = prep_fn(sc, bp, data); + if (error) { + xfs_trans_brelse(sc->tp, bp); + goto out_err; + } + + /* Queue buffer, and flush if we have too much dirty data. */ + xfs_buf_delwri_queue_here(bp, &buffers_list); + xfs_trans_brelse(sc->tp, bp); + + if (!(off & flush_mask)) { + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + } + } + + /* + * Write the new blocks to disk. 
If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + error = -EIO; + goto out_err; + } + + return 0; + +out_err: + xfs_buf_delwri_cancel(&buffers_list); + return error; +} + +/* + * Set the temporary file's size. Caller must join the tempfile to the scrub + * transaction and is responsible for adjusting block mappings as needed. + */ +int +xrep_tempfile_set_isize( + struct xfs_scrub *sc, + unsigned long long isize) +{ + if (sc->tempip->i_disk_size == isize) + return 0; + + sc->tempip->i_disk_size = isize; + i_size_write(VFS_I(sc->tempip), isize); + return xrep_tempfile_roll_trans(sc); +} + +/* + * Roll a repair transaction involving the temporary file. Caller must join + * both the temporary file and the file being scrubbed to the transaction. + * This function returns with both inodes joined to a new scrub transaction, + * or the usual negative errno. + */ +int +xrep_tempfile_roll_trans( + struct xfs_scrub *sc) +{ + int error; + + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); + error = xrep_roll_trans(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + return 0; +} + +/* + * Fill out the mapping exchange request in preparation for atomically + * committing the contents of a metadata file that we've rebuilt in the temp + * file. + */ +STATIC int +xrep_tempexch_prep_request( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + + memset(tx, 0, sizeof(struct xrep_tempexch)); + + /* COW forks don't exist on disk. */ + if (whichfork == XFS_COW_FORK) { + ASSERT(0); + return -EINVAL; + } + + /* Both files should have the relevant forks. */ + if (!xfs_ifork_ptr(sc->ip, whichfork) || + !xfs_ifork_ptr(sc->tempip, whichfork)) { + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); + return -EINVAL; + } + + /* Exchange all mappings in both forks. */ + req->ip1 = sc->tempip; + req->ip2 = sc->ip; + req->startoff1 = off; + req->startoff2 = off; + switch (whichfork) { + case XFS_ATTR_FORK: + req->flags |= XFS_EXCHMAPS_ATTR_FORK; + break; + case XFS_DATA_FORK: + /* Exchange sizes when exchanging all data fork mappings. */ + if (off == 0 && len == XFS_MAX_FILEOFF) + req->flags |= XFS_EXCHMAPS_SET_SIZES; + break; + } + req->blockcount = len; + + return 0; +} + +/* + * Fill out the mapping exchange resource estimation structures in preparation + * for exchanging the contents of a metadata file that we've rebuilt in the + * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files. + */ +STATIC int +xrep_tempexch_estimate( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + struct xfs_exchmaps_req *req = &tx->req; + struct xfs_ifork *ifp; + struct xfs_ifork *tifp; + int whichfork = xfs_exchmaps_reqfork(req); + int state = 0; + + /* + * The exchmaps code only knows how to exchange file fork space + * mappings. Any fork data in local format must be promoted to a + * single block before the exchange can take place.
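That leaves four combinations of local/extents forks, each handled by its own case below.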
+ */ + ifp = xfs_ifork_ptr(sc->ip, whichfork); + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 1; + + tifp = xfs_ifork_ptr(sc->tempip, whichfork); + if (tifp->if_format == XFS_DINODE_FMT_LOCAL) + state |= 2; + + switch (state) { + case 0: + /* Both files have mapped extents; use the regular estimate. */ + return xfs_exchrange_estimate(req); + case 1: + /* + * The file being repaired is in local format, but the temp + * file has mapped extents. To perform the exchange, the file + * being repaired must have its shortform data converted to an + * ondisk block so that the forks will be in extents format. + * We need one resblk for the conversion; the number of + * exchanges is (worst case) the temporary file's extent count + * plus the block we converted. + */ + req->ip1_bcount = sc->tempip->i_nblocks; + req->ip2_bcount = 1; + req->nr_exchanges = 1 + tifp->if_nextents; + req->resblks = 1; + break; + case 2: + /* + * The temporary file is in local format, but the file being + * repaired has mapped extents. To perform the exchange, the + * temp file must have its shortform data converted to an + * ondisk block, and the fork changed to extents format. We + * need one resblk for the conversion; the number of exchanges + * is (worst case) the extent count of the file being repaired + * plus the block we converted. + */ + req->ip1_bcount = 1; + req->ip2_bcount = sc->ip->i_nblocks; + req->nr_exchanges = 1 + ifp->if_nextents; + req->resblks = 1; + break; + case 3: + /* + * Both forks are in local format. To perform the exchange, + * both files must have their shortform data converted to + * fsblocks, and both forks must be converted to extents + * format. We need two resblks for the two conversions, and + * the number of exchanges is 1 since there's only one block at + * fileoff 0. Presumably, the caller could not exchange the + * two inode fork areas directly. + */ + req->ip1_bcount = 1; + req->ip2_bcount = 1; + req->nr_exchanges = 1; + req->resblks = 2; + break; + } + + return xfs_exchmaps_estimate_overhead(req); +} + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. + */ +STATIC int +xrep_tempexch_reserve_quota( + struct xfs_scrub *sc, + const struct xrep_tempexch *tx) +{ + struct xfs_trans *tp = sc->tp; + const struct xfs_exchmaps_req *req = &tx->req; + int64_t ddelta, rdelta; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. + */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + xfs_is_metadir_inode(req->ip1) || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + /* + * Quota reservation for each file comes from two sources. First, we + * need to account for any net gain in mapped blocks during the + * exchange. Second, we need reservation for the gross gain in mapped + * blocks so that we don't trip over any quota block reservation + * assertions. We must reserve the gross gain because the quota code + * subtracts from bcount the number of blocks that we unmap; it does + * not add that quantity back to the quota block reservation.
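For example (with made-up numbers), if ip1 maps 10 blocks and ip2 maps 25, ip1 must reserve the 15-block net gain plus the 10 blocks it gives up, i.e. 25 blocks.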
+ */ + ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); + rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, + true); + if (error) + return error; + + ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); + rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); + return xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, + true); +} + +/* + * Prepare an existing transaction for an atomic file contents exchange. + * + * This function fills out the mapping exchange request and resource estimation + * structures in preparation for exchanging the contents of a metadata file + * that has been rebuilt in the temp file. Next, it reserves space and quota + * for the transaction. + * + * The caller must hold ILOCK_EXCL of the scrub target file and the temporary + * file. The caller must join both inodes to the transaction with no unlock + * flags, and is responsible for dropping both ILOCKs when appropriate. Only + * use this when those ILOCKs cannot be dropped. + */ +int +xrep_tempexch_trans_reserve( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(sc->tp != NULL); + xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); + + error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx); + if (error) + return error; + + error = xfs_exchmaps_estimate(&tx->req); + if (error) + return error; + + error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); + if (error) + return error; + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Create a new transaction for a file contents exchange. + * + * This function fills out the mapping exchange request and resource + * estimation structures in preparation for exchanging the contents of a + * metadata file that has been rebuilt in the temp file. Next, it reserves + * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and + * reserves quota for the transaction. + * + * The caller is responsible for dropping both ILOCKs when appropriate. + */ +int +xrep_tempexch_trans_alloc( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempexch *tx) +{ + unsigned int flags = 0; + int error; + + ASSERT(sc->tp == NULL); + ASSERT(xfs_has_exchange_range(sc->mp)); + + error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF, + tx); + if (error) + return error; + + error = xrep_tempexch_estimate(sc, tx); + if (error) + return error; + + if (xfs_has_lazysbcount(sc->mp)) + flags |= XFS_TRANS_RES_FDBLKS; + + error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + tx->req.resblks, 0, flags, &sc->tp); + if (error) + return error; + + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip); + + return xrep_tempexch_reserve_quota(sc, tx); +} + +/* + * Exchange file mappings (and hence file contents) between the file being + * repaired and the temporary file. Returns with both inodes locked and joined + * to a clean scrub transaction.
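If the request set XFS_EXCHMAPS_SET_SIZES, the incore i_size values are exchanged here too, mirroring the ondisk size exchange.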
+ */ +int +xrep_tempexch_contents( + struct xfs_scrub *sc, + struct xrep_tempexch *tx) +{ + int error; + + ASSERT(xfs_has_exchange_range(sc->mp)); + + xfs_exchange_mappings(sc->tp, &tx->req); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * If we exchanged the ondisk sizes of two metadata files, we must + * exchange the incore sizes as well. + */ + if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) { + loff_t temp; + + temp = i_size_read(VFS_I(sc->ip)); + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + i_size_write(VFS_I(sc->tempip), temp); + } + + return 0; +} + +/* + * Write local format data from one of the temporary file's forks into the same + * fork of the file being repaired, and exchange the file sizes, if appropriate. + * Caller must ensure that the file being repaired has enough fork space to + * hold all the bytes. + */ +void +xrep_tempfile_copyout_local( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *temp_ifp; + struct xfs_ifork *ifp; + unsigned int ilog_flags = XFS_ILOG_CORE; + + temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); + ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(temp_ifp != NULL); + ASSERT(ifp != NULL); + ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + + switch (whichfork) { + case XFS_DATA_FORK: + ASSERT(sc->tempip->i_disk_size <= + xfs_inode_data_fork_size(sc->ip)); + break; + case XFS_ATTR_FORK: + ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); + break; + default: + ASSERT(0); + return; + } + + /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */ + xfs_idestroy_fork(ifp); + xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data, + temp_ifp->if_bytes); + + if (whichfork == XFS_DATA_FORK) { + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + sc->ip->i_disk_size = sc->tempip->i_disk_size; + } + + ilog_flags |= xfs_ilog_fdata(whichfork); + xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); +} + +/* Decide if a given XFS inode is a temporary file for a repair. */ +bool +xrep_is_tempfile( + const struct xfs_inode *ip) +{ + const struct inode *inode = &ip->i_vnode; + struct xfs_mount *mp = ip->i_mount; + + /* + * Files in the metadata directory tree also have S_PRIVATE set and + * IOP_XATTR unset, so we must distinguish them separately. We (ab)use + * the IRECOVERY flag to mark temporary metadir inodes knowing that the + * end of log recovery clears IRECOVERY, so the only ones that can + * exist during online repair are the ones we create. + */ + if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA)) + return __xfs_iflags_test(ip, XFS_IRECOVERY); + + if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h new file mode 100644 index 000000000000..71c1b54599c3 --- /dev/null +++ b/fs/xfs/scrub/tempfile.h @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPFILE_H__ +#define __XFS_SCRUB_TEMPFILE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); +void xrep_tempfile_rele(struct xfs_scrub *sc); + +int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc); + +bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); +int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); +void xrep_tempfile_iounlock(struct xfs_scrub *sc); + +void xrep_tempfile_ilock(struct xfs_scrub *sc); +bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc); +void xrep_tempfile_iunlock(struct xfs_scrub *sc); +void xrep_tempfile_iunlock_both(struct xfs_scrub *sc); +void xrep_tempfile_ilock_both(struct xfs_scrub *sc); + +int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len); + +enum xfs_blft; + +typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc, + struct xfs_buf *bp, void *data); + +int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data); + +int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize); + +int xrep_tempfile_roll_trans(struct xfs_scrub *sc); +void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork); +bool xrep_is_tempfile(const struct xfs_inode *ip); +#else +static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) +{ + xchk_ilock(sc, XFS_IOLOCK_EXCL); +} +# define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_adjust_directory_tree(sc) (0) +# define xrep_tempfile_rele(sc) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPFILE_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 3dd281d6d185..2450e214103f 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -19,13 +19,21 @@ #include "xfs_da_format.h" #include "xfs_dir2.h" #include "xfs_rmap.h" +#include "xfs_parent.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/fscounters.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfblob.h" +#include "scrub/dirtree.h" /* Figure out which block the btree cursor was pointing to. 
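 * The btree scrub tracepoints below convert the result into agno/agbno pairs with XFS_FSB_TO_AGNO and XFS_FSB_TO_AGBNO.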
*/ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5b294be52c55..d7c4ced47c15 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -17,6 +17,7 @@ #include "xfs_bit.h" #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xfs_scrub; struct xfile; struct xfarray; @@ -26,6 +27,10 @@ struct xchk_iscan; struct xchk_nlink; struct xchk_fscounters; struct xfs_rmap_update_params; +struct xfs_parent_rec; +enum xchk_dirpath_outcome; +struct xchk_dirtree; +struct xchk_dirtree_outcomes; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -36,6 +41,9 @@ struct xfs_rmap_update_params; TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); +TRACE_DEFINE_ENUM(XG_TYPE_AG); +TRACE_DEFINE_ENUM(XG_TYPE_RTG); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -64,6 +72,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -93,7 +107,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ - { XFS_SCRUB_TYPE_HEALTHY, "healthy" } + { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ + { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ + { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ + { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ + { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ + { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -169,6 +189,8 @@ DEFINE_EVENT(xchk_class, name, \ DEFINE_SCRUB_EVENT(xchk_start); DEFINE_SCRUB_EVENT(xchk_done); DEFINE_SCRUB_EVENT(xchk_deadlock_retry); +DEFINE_SCRUB_EVENT(xchk_dirtree_start); +DEFINE_SCRUB_EVENT(xchk_dirtree_done); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); @@ -199,6 +221,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \ DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); +DECLARE_EVENT_CLASS(xchk_vector_head_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), + TP_ARGS(ip, vhead), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(unsigned short, rest_us) + __field(unsigned short, nr_vecs) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = vhead->svh_agno; + __entry->inum = vhead->svh_ino; + __entry->gen = vhead->svh_gen; + __entry->flags = vhead->svh_flags; + __entry->rest_us = vhead->svh_rest_us; + __entry->nr_vecs = vhead->svh_nr; + ), + TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->rest_us, + __entry->nr_vecs) +) +#define 
DEFINE_SCRUBV_HEAD_EVENT(name) \ +DEFINE_EVENT(xchk_vector_head_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \ + TP_ARGS(ip, vhead)) + +DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start); + +DECLARE_EVENT_CLASS(xchk_vector_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, + unsigned int vec_nr, struct xfs_scrub_vec *v), + TP_ARGS(mp, vhead, vec_nr, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, vec_nr) + __field(unsigned int, vec_type) + __field(unsigned int, vec_flags) + __field(int, vec_ret) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->vec_nr = vec_nr; + __entry->vec_type = v->sv_type; + __entry->vec_flags = v->sv_flags; + __entry->vec_ret = v->sv_ret; + ), + TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->vec_nr, + __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS), + __entry->vec_ret) +) +#define DEFINE_SCRUBV_EVENT(name) \ +DEFINE_EVENT(xchk_vector_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \ + unsigned int vec_nr, struct xfs_scrub_vec *v), \ + TP_ARGS(mp, vhead, vec_nr, v)) + +DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail); +DEFINE_SCRUBV_EVENT(xchk_scrubv_item); +DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -364,6 +461,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen); #ifdef CONFIG_XFS_QUOTA DECLARE_EVENT_CLASS(xchk_dqiter_class, @@ -475,7 +573,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -515,10 +613,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -558,7 +656,7 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -598,7 +696,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + __assign_str(name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -637,7 +735,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __assign_str(name, cur->bc_ops->name); + 
__assign_str(name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; @@ -686,12 +784,12 @@ TRACE_EVENT(xchk_xref_error, ); TRACE_EVENT(xchk_iallocbt_check_cluster, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, xfs_daddr_t map_daddr, - unsigned short map_len, unsigned int chunk_ino, - unsigned int nr_inodes, uint16_t cluster_mask, - uint16_t holemask, unsigned int cluster_ino), - TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t startino, + xfs_daddr_t map_daddr, unsigned short map_len, + unsigned int chunk_ino, unsigned int nr_inodes, + uint16_t cluster_mask, uint16_t holemask, + unsigned int cluster_ino), + TP_ARGS(pag, startino, map_daddr, map_len, chunk_ino, nr_inodes, cluster_mask, holemask, cluster_ino), TP_STRUCT__entry( __field(dev_t, dev) @@ -706,8 +804,8 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __field(uint16_t, holemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = startino; __entry->map_daddr = map_daddr; __entry->map_len = map_len; @@ -836,7 +934,8 @@ DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); TRACE_EVENT(xchk_refcount_incorrect, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_refcount_irec *irec, xfs_nlink_t seen), TP_ARGS(pag, irec, seen), TP_STRUCT__entry( @@ -849,8 +948,8 @@ TRACE_EVENT(xchk_refcount_incorrect, __field(xfs_nlink_t, seen) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -873,18 +972,16 @@ TRACE_EVENT(xfile_create, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) - __array(char, pathname, 256) + __array(char, pathname, MAXNAMELEN) ), TP_fast_assign( - char pathname[257]; char *path; __entry->ino = file_inode(xf->file)->i_ino; - memset(pathname, 0, sizeof(pathname)); - path = file_path(xf->file, pathname, sizeof(pathname) - 1); + path = file_path(xf->file, __entry->pathname, MAXNAMELEN); if (IS_ERR(path)) - path = "(unknown)"; - strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + strncpy(__entry->pathname, "(unknown)", + sizeof(__entry->pathname)); ), TP_printk("xfino 0x%lx path '%s'", __entry->ino, @@ -947,6 +1044,7 @@ DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_folio); DEFINE_XFILE_EVENT(xfile_put_folio); +DEFINE_XFILE_EVENT(xfile_discard); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -1300,7 +1398,7 @@ TRACE_EVENT(xchk_iscan_iget_batch, __entry->unavail) ); -TRACE_EVENT(xchk_iscan_iget_retry_wait, +DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class, TP_PROTO(struct xchk_iscan *iscan), TP_ARGS(iscan), TP_STRUCT__entry( @@ -1326,7 +1424,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->remaining, __entry->iget_timeout, __entry->retry_delay) -); +) +#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) 
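+ +/* Instantiate the shared retry-wait class for the iget and AGI wait events. */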
+DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait); +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait); TRACE_EVENT(xchk_nlinks_collect_dirent, TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, @@ -1354,6 +1458,33 @@ TRACE_EVENT(xchk_nlinks_collect_dirent, __get_str(name)) ); +TRACE_EVENT(xchk_nlinks_collect_pptr, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(mp, dp, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = be64_to_cpu(pptr->p_ino); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + TRACE_EVENT(xchk_nlinks_collect_metafile, TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), TP_ARGS(mp, ino), @@ -1502,76 +1633,413 @@ DEFINE_EVENT(xchk_nlinks_diff_class, name, \ TP_ARGS(mp, ip, live)) DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); +DECLARE_EVENT_CLASS(xchk_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + xfs_ino_t far_ino), + TP_ARGS(ip, name, far_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, far_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + __entry->far_ino = far_ino; + ), + TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __get_str(name), + __entry->far_ino) +) +#define DEFINE_XCHK_PPTR_EVENT(name) \ +DEFINE_EVENT(xchk_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + xfs_ino_t far_ino), \ + TP_ARGS(ip, name, far_ino)) +DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath); +DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath); + +DECLARE_EVENT_CLASS(xchk_dirtree_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->path_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRTREE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, name, pptr)) +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path); +DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards); + +DECLARE_EVENT_CLASS(xchk_dirpath_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int path_nr, unsigned int step_nr, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(unsigned int, child_gen) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->child_gen = VFS_I(ip)->i_generation; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->child_gen, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_XCHK_DIRPATH_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \ + unsigned int path_nr, unsigned int step_nr, \ + const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(sc, ip, path_nr, step_nr, name, pptr)) +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree); + +TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE); +TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING); +TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED); + +#define XCHK_DIRPATH_OUTCOME_STRINGS \ + { XCHK_DIRPATH_SCANNING, "scanning" }, \ + { XCHK_DIRPATH_DELETE, "delete" }, \ + { XCHK_DIRPATH_CORRUPT, "corrupt" }, \ + { XCHK_DIRPATH_LOOP, "loop" }, \ + { XCHK_DIRPATH_STALE, "stale" }, \ + { XCHK_DIRPATH_OK, "ok" }, \ + { XREP_DIRPATH_DELETING, "deleting" }, \ + { XREP_DIRPATH_DELETED, "deleted" }, \ + { XREP_DIRPATH_ADOPTING, "adopting" }, \ + { XREP_DIRPATH_ADOPTED, "adopted" } + +DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class, + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, + unsigned int nr_steps, \ + unsigned int outcome), + TP_ARGS(sc, path_nr, nr_steps, outcome), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, path_nr) + __field(unsigned int, nr_steps) + __field(unsigned int, outcome) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->nr_steps = nr_steps; + __entry->outcome = outcome; + ), + TP_printk("dev %d:%d path %llu steps %u outcome %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->nr_steps, + __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS)) +); +#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \ +DEFINE_EVENT(xchk_dirpath_outcome_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \ + unsigned int nr_steps, \ + unsigned int outcome), \ + TP_ARGS(sc, path_nr, nr_steps, outcome)) +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome); +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path); + +DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class, + TP_PROTO(const struct xchk_dirtree *dl, + const struct xchk_dirtree_outcomes *oc), + TP_ARGS(dl, oc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, rootino) + __field(unsigned int, nr_paths) + __field(unsigned int, bad) + __field(unsigned int, suspect) + __field(unsigned int, good) + __field(bool, needs_adoption) + ), + TP_fast_assign( + __entry->dev = dl->sc->mp->m_super->s_dev; + __entry->ino = dl->sc->ip->i_ino; + __entry->rootino = dl->root_ino; + __entry->nr_paths = dl->nr_paths; + __entry->bad = oc->bad; + __entry->suspect = oc->suspect; + __entry->good = oc->good; + __entry->needs_adoption = oc->needs_adoption ? 1 : 0; + ), + TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->rootino, + __entry->nr_paths, + __entry->bad, + __entry->suspect, + __entry->good, + __entry->needs_adoption) +); +#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \ + TP_PROTO(const struct xchk_dirtree *dl, \ + const struct xchk_dirtree_outcomes *oc), \ + TP_ARGS(dl, oc)) +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate); + +TRACE_EVENT(xchk_dirpath_changed, + TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr, + unsigned int step_nr, const struct xfs_inode *dp, + const struct xfs_inode *ip, const struct xfs_name *xname), + TP_ARGS(sc, path_nr, step_nr, dp, ip, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->parent_ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_dirtree_live_update, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp, + int action, const struct xfs_inode *ip, int delta, + const struct xfs_name *xname), + TP_ARGS(sc, dp, action, ip, delta, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, parent_ino) + 
__field(int, action) + __field(xfs_ino_t, child_ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->parent_ino = dp->i_ino; + __entry->action = action; + __entry->child_ino = ip->i_ino; + __entry->delta = delta; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->parent_ino, + __entry->child_ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + +DECLARE_EVENT_CLASS(xchk_metapath_class, + TP_PROTO(struct xfs_scrub *sc, const char *path, + struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(sc, path, dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, scrub_ino) + __field(xfs_ino_t, parent_ino) + __field(xfs_ino_t, ino) + __string(name, path) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO; + __entry->parent_ino = dp ? dp->i_ino : NULLFSINO; + __entry->ino = ino; + __assign_str(name); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->scrub_ino, + __entry->parent_ino, + __get_str(name), + __entry->ino) +); +#define DEFINE_XCHK_METAPATH_EVENT(name) \ +DEFINE_EVENT(xchk_metapath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, const char *path, \ + struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(sc, path, dp, ino)) +DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(pag, agbno, len), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(pag, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); +DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, - bool crosslinked), - TP_ARGS(pag, agbno, len, crosslinked), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len, bool 
crosslinked), + TP_ARGS(xg, agbno, len, crosslinked), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) __field(bool, crosslinked) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->crosslinked = crosslinked; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->crosslinked ? 1 : 0) ); #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ DEFINE_EVENT(xrep_reap_find_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ - bool crosslinked), \ - TP_ARGS(pag, agbno, len, crosslinked)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len, bool crosslinked), \ + TP_ARGS(xg, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); -DECLARE_EVENT_CLASS(xrep_rmap_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(mp, agno, agbno, len, owner, offset, flags), +TRACE_EVENT(xrep_ibt_walk_rmap, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1582,13 +2050,13 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->len = len; - __entry->owner = owner; - __entry->offset = offset; - __entry->flags = flags; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1599,19 +2067,11 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __entry->offset, __entry->flags) ); -#define DEFINE_REPAIR_RMAP_EVENT(name) \ -DEFINE_EVENT(xrep_rmap_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_alloc_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1619,8 +2079,8 @@ TRACE_EVENT(xrep_abt_found, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); 
__entry->startblock = rec->ar_startblock; __entry->blockcount = rec->ar_blockcount; ), @@ -1632,9 +2092,9 @@ TRACE_EVENT(xrep_abt_found, ) TRACE_EVENT(xrep_ibt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_inobt_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1645,8 +2105,8 @@ TRACE_EVENT(xrep_ibt_found, __field(uint64_t, freemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = rec->ir_startino; __entry->holemask = rec->ir_holemask; __entry->count = rec->ir_count; @@ -1664,28 +2124,33 @@ TRACE_EVENT(xrep_ibt_found, ) TRACE_EVENT(xrep_refc_found, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), - TP_ARGS(pag, rec), + TP_PROTO(const struct xfs_group *xg, + const struct xfs_refcount_irec *rec), + TP_ARGS(xg, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(enum xfs_refc_domain, domain) + __field(enum xfs_group_type, type) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->agno = xg->xg_gno; + __entry->type = xg->xg_type; __entry->domain = rec->rc_domain; __entry->startblock = rec->rc_startblock; __entry->blockcount = rec->rc_blockcount; __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -1724,9 +2189,8 @@ TRACE_EVENT(xrep_bmap_found, ); TRACE_EVENT(xrep_rmap_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - const struct xfs_rmap_irec *rec), - TP_ARGS(mp, agno, rec), + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1737,8 +2201,8 @@ TRACE_EVENT(xrep_rmap_found, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = rec->rm_startblock; __entry->len = rec->rm_blockcount; __entry->owner = rec->rm_owner; @@ -1756,9 +2220,9 @@ TRACE_EVENT(xrep_rmap_found, ); TRACE_EVENT(xrep_findroot_block, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, uint32_t magic, uint16_t level), - TP_ARGS(mp, agno, agbno, magic, level), + TP_ARGS(pag, agbno, magic, level), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1767,8 +2231,8 @@ TRACE_EVENT(xrep_findroot_block, __field(uint16_t, level) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->magic = magic; __entry->level = level; @@ -1781,10 +2245,10 @@ 
TRACE_EVENT(xrep_findroot_block, __entry->level) ) TRACE_EVENT(xrep_calc_ag_resblks, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t icount, + xfs_agblock_t aglen, xfs_agblock_t freelen, xfs_agblock_t usedlen), - TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_ARGS(pag, icount, aglen, freelen, usedlen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1794,8 +2258,8 @@ TRACE_EVENT(xrep_calc_ag_resblks, __field(xfs_agblock_t, usedlen) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->icount = icount; __entry->aglen = aglen; __entry->freelen = freelen; @@ -1810,10 +2274,10 @@ TRACE_EVENT(xrep_calc_ag_resblks, __entry->usedlen) ) TRACE_EVENT(xrep_calc_ag_resblks_btsize, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, - xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), - TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t bnobt_sz, + xfs_agblock_t inobt_sz, xfs_agblock_t rmapbt_sz, + xfs_agblock_t refcbt_sz), + TP_ARGS(pag, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1823,8 +2287,8 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __field(xfs_agblock_t, refcbt_sz) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bnobt_sz = bnobt_sz; __entry->inobt_sz = inobt_sz; __entry->rmapbt_sz = rmapbt_sz; @@ -1838,6 +2302,32 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->rmapbt_sz, __entry->refcbt_sz) ) + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz), + TP_ARGS(mp, rgno, usedlen, rmapbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, usedlen) + __field(xfs_rgblock_t, rmapbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rgno; + __entry->usedlen = usedlen; + __entry->rmapbt_sz = rmapbt_sz; + ), + TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->usedlen, + __entry->rmapbt_sz) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xrep_reset_counters, TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), TP_ARGS(mp, fsc), @@ -1864,10 +2354,9 @@ TRACE_EVENT(xrep_reset_counters, ) DECLARE_EVENT_CLASS(xrep_newbt_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - int64_t owner), - TP_ARGS(mp, agno, agbno, len, owner), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t len, int64_t owner), + TP_ARGS(pag, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1876,8 +2365,8 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class, __field(int64_t, owner) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->owner = owner; @@ -1891,10 +2380,9 @@ 
DECLARE_EVENT_CLASS(xrep_newbt_extent_class, ); #define DEFINE_NEWBT_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_newbt_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - int64_t owner), \ - TP_ARGS(mp, agno, agbno, len, owner)) + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + xfs_extlen_t len, int64_t owner), \ + TP_ARGS(pag, agbno, len, owner)) DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); @@ -2182,7 +2670,7 @@ TRACE_EVENT(xrep_cow_replace_mapping, ); TRACE_EVENT(xrep_cow_free_staging, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t blockcount), TP_ARGS(pag, agbno, blockcount), TP_STRUCT__entry( @@ -2192,8 +2680,8 @@ TRACE_EVENT(xrep_cow_free_staging, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->blockcount = blockcount; ), @@ -2238,11 +2726,12 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); TRACE_EVENT(xrep_rmap_live_update, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + TP_PROTO(const struct xfs_group *xg, unsigned int op, const struct xfs_rmap_update_params *p), - TP_ARGS(mp, agno, op, p), + TP_ARGS(xg, op, p), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(unsigned int, op) __field(xfs_agblock_t, agbno) @@ -2252,8 +2741,9 @@ TRACE_EVENT(xrep_rmap_live_update, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->op = op; __entry->agbno = p->startblock; __entry->len = p->blockcount; @@ -2262,10 +2752,12 @@ TRACE_EVENT(xrep_rmap_live_update, if (p->unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->op, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->owner, @@ -2273,6 +2765,1076 @@ TRACE_EVENT(xrep_rmap_live_update, __entry->flags) ); +TRACE_EVENT(xrep_tempfile_create, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(xfs_ino_t, temp_inum) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->file ? 
XFS_I(file_inode(sc->file))->i_ino : 0; + __entry->type = sc->sm->sm_type; + __entry->agno = sc->sm->sm_agno; + __entry->inum = sc->sm->sm_ino; + __entry->gen = sc->sm->sm_gen; + __entry->flags = sc->sm->sm_flags; + __entry->temp_inum = sc->tempip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->inum, + __entry->gen, + __entry->flags, + __entry->temp_inum) +); + +DECLARE_EVENT_CLASS(xrep_tempfile_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(sc, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->tempip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); +#define DEFINE_XREP_TEMPFILE_EVENT(name) \ +DEFINE_EVENT(xrep_tempfile_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + struct xfs_bmbt_irec *irec), \ + TP_ARGS(sc, whichfork, irec)) +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc); +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin); + +TRACE_EVENT(xreap_ifork_extent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork, + const struct xfs_bmbt_irec *irec), + TP_ARGS(sc, ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, fileoff) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->fileoff = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->agno, + __entry->agbno, + __entry->fileoff, + __entry->len, + __entry->state) +); + +TRACE_EVENT(xreap_bmapi_binval_scan, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec, + xfs_extlen_t scan_blocks), + TP_ARGS(sc, irec, scan_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, scan_blocks) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->scan_blocks = scan_blocks; + ), + 
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->scan_blocks) +); + +TRACE_EVENT(xrep_xattr_recover_leafblock, + TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic), + TP_ARGS(ip, dabno, magic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_dablk_t, dabno) + __field(uint16_t, magic) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + ), + TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->dabno, + __entry->magic) +); + +DECLARE_EVENT_CLASS(xrep_xattr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, + unsigned int namelen, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + __field(unsigned int, valuelen) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + __entry->valuelen = valuelen; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR), + __entry->namelen, + __get_str(name), + __entry->valuelen) +); +#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \ + unsigned int namelen, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, valuelen)) +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr); + +DECLARE_EVENT_CLASS(xrep_pptr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, + unsigned int namelen, const void *value, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, value, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + const struct xfs_parent_rec *rec = value; + + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(rec->p_ino); + __entry->parent_gen = be32_to_cpu(rec->p_gen); + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \ + unsigned int namelen, const void *value, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, value, valuelen)) +DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr); 
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr); + +DECLARE_EVENT_CLASS(xrep_xattr_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), + TP_ARGS(ip, arg_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, src_ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->src_ino = arg_ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx src 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->src_ino) +) +#define DEFINE_XREP_XATTR_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \ + TP_ARGS(ip, arg_ip)) +DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork); +DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset); + +DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd); +DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove); + +TRACE_EVENT(xrep_dir_recover_dirblock, + TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic, + uint32_t magic_guess), + TP_ARGS(dp, dabno, magic, magic_guess), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_dablk_t, dabno) + __field(uint32_t, magic) + __field(uint32_t, magic_guess) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + __entry->magic_guess = magic_guess; + ), + TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->dabno, + __entry->magic, + __entry->magic_guess) +); + +DECLARE_EVENT_CLASS(xrep_dir_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), + TP_ARGS(dp, parent_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, parent_ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->parent_ino = parent_ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->parent_ino) +) +#define DEFINE_XREP_DIR_EVENT(name) \ +DEFINE_EVENT(xrep_dir_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \ + TP_ARGS(dp, parent_ino)) +DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree); 
+DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork);
+DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot);
+
+DECLARE_EVENT_CLASS(xrep_dirent_class,
+	TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name,
+		 xfs_ino_t ino),
+	TP_ARGS(dp, name, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+		__field(xfs_ino_t, ino)
+		__field(uint8_t, ftype)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+		__entry->ino = ino;
+		__entry->ftype = name->type;
+	),
+	TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+		  __entry->namelen,
+		  __get_str(name),
+		  __entry->ino)
+)
+#define DEFINE_XREP_DIRENT_EVENT(name) \
+DEFINE_EVENT(xrep_dirent_class, name, \
+	TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \
+		 xfs_ino_t ino), \
+	TP_ARGS(dp, name, ino))
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename);
+
+DECLARE_EVENT_CLASS(xrep_adoption_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved),
+	TP_ARGS(dp, ip, moved),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_ino_t, child_ino)
+		__field(bool, moved)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->child_ino = ip->i_ino;
+		__entry->moved = moved;
+	),
+	TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino,
+		  __entry->child_ino,
+		  __entry->moved)
+);
+#define DEFINE_XREP_ADOPTION_EVENT(name) \
+DEFINE_EVENT(xrep_adoption_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \
+	TP_ARGS(dp, ip, moved))
+DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll);
+
+DECLARE_EVENT_CLASS(xrep_parent_salvage_class,
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino),
+	TP_ARGS(dp, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->ino = ino;
+	),
+	TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino,
+		  __entry->ino)
+)
+#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_parent_salvage_class, name, \
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \
+	TP_ARGS(dp, ino))
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache);
+
+DECLARE_EVENT_CLASS(xrep_pptr_class,
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name,
+		 const struct xfs_parent_rec *pptr),
+	TP_ARGS(ip, name, pptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, parent_gen)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->parent_ino = be64_to_cpu(pptr->p_ino);
+		__entry->parent_gen = be32_to_cpu(pptr->p_gen);
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+)
+#define DEFINE_XREP_PPTR_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+		 const struct xfs_parent_rec *pptr), \
+	TP_ARGS(ip, name, pptr))
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd);
+DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove);
+
+DECLARE_EVENT_CLASS(xrep_pptr_scan_class,
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+		 const struct xfs_name *name),
+	TP_ARGS(ip, dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, parent_gen)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->parent_ino = dp->i_ino;
+		__entry->parent_gen = VFS_IC(dp)->i_generation;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_scan_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+		 const struct xfs_name *name), \
+	TP_ARGS(ip, dp, name))
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd);
+DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove);
+
+TRACE_EVENT(xrep_nlinks_set_record,
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+		 const struct xchk_nlink *obs),
+	TP_ARGS(mp, ino, obs),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_nlink_t, parents)
+		__field(xfs_nlink_t, backrefs)
+		__field(xfs_nlink_t, children)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ino;
+		__entry->parents = obs->parents;
+		__entry->backrefs = obs->backrefs;
+		__entry->children = obs->children;
+	),
+	TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parents,
+		  __entry->backrefs,
+		  __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xrep_dentry_class,
+	TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry),
+	TP_ARGS(mp, dentry),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, flags)
+		__field(unsigned long, ino)
+		__field(bool, positive)
+		__field(unsigned long, parent_ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, dentry->d_name.len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->flags = dentry->d_flags;
+		__entry->positive = d_is_positive(dentry);
+		if (dentry->d_parent && d_inode(dentry->d_parent))
+			__entry->parent_ino = d_inode(dentry->d_parent)->i_ino;
+		else
+			__entry->parent_ino = -1UL;
+		__entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0;
+		__entry->namelen = dentry->d_name.len;
+		memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len);
+	),
+	TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->flags,
+		  __entry->positive,
+		  __entry->parent_ino,
+		  __entry->ino,
+		  __entry->namelen,
+		  __get_str(name))
+);
+#define DEFINE_REPAIR_DENTRY_EVENT(name) \
+DEFINE_EVENT(xrep_dentry_class, name, \
+	TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \
+	TP_ARGS(mp, dentry))
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child);
+DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child);
+
+TRACE_EVENT(xrep_symlink_salvage_target,
+	TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen),
+	TP_ARGS(ip, target, targetlen),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, targetlen)
+		__dynamic_array(char, target, targetlen + 1)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->targetlen = targetlen;
+		memcpy(__get_str(target), target, targetlen);
+		__get_str(target)[targetlen] = 0;
+	),
+	TP_printk("dev %d:%d ip 0x%llx target '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->targetlen,
+		  __get_str(target))
+);
+
+DECLARE_EVENT_CLASS(xrep_symlink_class,
+	TP_PROTO(struct xfs_inode *ip),
+	TP_ARGS(ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+	),
+	TP_printk("dev %d:%d ip 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino)
+);
+
+#define DEFINE_XREP_SYMLINK_EVENT(name) \
+DEFINE_EVENT(xrep_symlink_class, name, \
+	TP_PROTO(struct xfs_inode *ip), \
+	TP_ARGS(ip))
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild);
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork);
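
Each DECLARE_EVENT_CLASS/DEFINE_EVENT pair above expands into a callable trace_<name>() hook, so the repair code fires these events with an ordinary function call. A minimal sketch of a caller, assuming a scrub context that already holds the inode; the helper function is hypothetical, only the tracepoint comes from this patch:

	/* Hypothetical caller firing the class-derived symlink tracepoint. */
	STATIC int
	xrep_symlink_example(struct xfs_scrub *sc)
	{
		/* Generated by DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild). */
		trace_xrep_symlink_rebuild(sc->ip);

		/* ... rebuild the symlink target from the salvaged buffer ... */
		return 0;
	}
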
TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); + +DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); +#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ +DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, \ + xfs_agino_t prev_agino, xfs_agino_t next_agino), \ + TP_ARGS(pag, bucket, prev_agino, next_agino)) +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok); + +TRACE_EVENT(xrep_iunlink_relink_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino), + TP_ARGS(ip, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + __field(xfs_agino_t, new_next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->next_agino = ip->i_next_unlinked; + __entry->new_next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->next_agino, + __entry->new_next_agino) +); + +TRACE_EVENT(xrep_iunlink_relink_prev, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, new_prev_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->new_prev_agino = prev_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->prev_agino, + __entry->new_prev_agino) +); + +TRACE_EVENT(xrep_iunlink_add_to_bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t agino, xfs_agino_t curr_head), + TP_ARGS(pag, bucket, agino, curr_head), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, agino) 
+ __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->bucket = bucket; + __entry->agino = agino; + __entry->next_agino = curr_head; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_commit_bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t old_agino, xfs_agino_t agino), + TP_ARGS(pag, bucket, old_agino, agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_agino) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->bucket = bucket; + __entry->old_agino = old_agino; + __entry->agino = agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_agino, + __entry->agino) +); + +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); + +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link); + +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xrep_rtbitmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end), + TP_ARGS(mp, start, end), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtxnum_t, start) + __field(xfs_rtxnum_t, end) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->end = end; + ), + TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx endrtx 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->end) +); +#define DEFINE_REPAIR_RGBITMAP_EVENT(name) \ +DEFINE_EVENT(xrep_rtbitmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \ + xfs_rtxnum_t end), \ + TP_ARGS(mp, start, end)) +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free); +DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk); + +TRACE_EVENT(xrep_rtbitmap_or, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + xfs_rtword_t mask, xfs_rtword_t word), + TP_ARGS(mp, wordoff, mask, word), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, mask) + __field(unsigned int, word) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->mask = mask; + __entry->word = word; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->mask, + __entry->word) +); + +TRACE_EVENT(xrep_rtbitmap_load, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff, + xfs_rtxnum_t rtx, xfs_rtxnum_t len), + TP_ARGS(rtg, rbmoff, rtx, len), + TP_STRUCT__entry( + __field(dev_t, dev) + 
__field(dev_t, rtdev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_fileoff_t, rbmoff) + __field(xfs_rtxnum_t, rtx) + __field(xfs_rtxnum_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->rbmoff = rbmoff; + __entry->rtx = rtx; + __entry->len = len; + ), + TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgno, + __entry->rbmoff, + __entry->rtx, + __entry->len) +); + +TRACE_EVENT(xrep_rtbitmap_load_words, + TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff, + unsigned long long wordoff, unsigned int wordcnt), + TP_ARGS(mp, rbmoff, wordoff, wordcnt), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_fileoff_t, rbmoff) + __field(unsigned long long, wordoff) + __field(unsigned int, wordcnt) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rbmoff = rbmoff; + __entry->wordoff = wordoff; + __entry->wordcnt = wordcnt; + ), + TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rbmoff, + __entry->wordoff, + __entry->wordcnt) +); + +TRACE_EVENT(xrep_rtbitmap_load_word, + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, + unsigned int bit, xfs_rtword_t ondisk_word, + xfs_rtword_t xfile_word, xfs_rtword_t word_mask), + TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(unsigned long long, wordoff) + __field(unsigned int, bit) + __field(xfs_rtword_t, ondisk_word) + __field(xfs_rtword_t, xfile_word) + __field(xfs_rtword_t, word_mask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->wordoff = wordoff; + __entry->bit = bit; + __entry->ondisk_word = ondisk_word; + __entry->xfile_word = xfile_word; + __entry->word_mask = word_mask; + ), + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->wordoff, + __entry->bit, + __entry->ondisk_word, + __entry->ondisk_word & __entry->word_mask, + __entry->xfile_word, + __entry->xfile_word & ~__entry->word_mask, + (__entry->xfile_word & ~__entry->word_mask) | + (__entry->ondisk_word & __entry->word_mask), + __entry->word_mask) +); + +TRACE_EVENT(xrep_rtrmap_found, + TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec), + TP_ARGS(mp, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->rgbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->rgbno, + 
__entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#endif /* CONFIG_XFS_RT */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 17c982a4821d..cdd13ed9c569 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -7,9 +7,9 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" /* @@ -486,6 +486,9 @@ xfarray_sortinfo_alloc( xfarray_sortinfo_lo(si)[0] = 0; xfarray_sortinfo_hi(si)[0] = array->nr - 1; + si->relax = INIT_XCHK_RELAX; + if (flags & XFARRAY_SORT_KILLABLE) + si->relax.interruptible = false; trace_xfarray_sort(si, nr_bytes); *infop = si; @@ -503,10 +506,7 @@ xfarray_sort_terminated( * few seconds so that we don't run afoul of the soft lockup watchdog * or RCU stall detector. */ - cond_resched(); - - if ((si->flags & XFARRAY_SORT_KILLABLE) && - fatal_signal_pending(current)) { + if (xchk_maybe_relax(&si->relax)) { if (*error == 0) *error = -EINTR; return true; @@ -822,12 +822,14 @@ xfarray_sort_scan( /* Grab the first folio that backs this array element. */ if (!si->folio) { + struct folio *folio; loff_t next_pos; - si->folio = xfile_get_folio(si->array->xfile, idx_pos, + folio = xfile_get_folio(si->array->xfile, idx_pos, si->array->obj_size, XFILE_ALLOC); - if (IS_ERR(si->folio)) - return PTR_ERR(si->folio); + if (IS_ERR(folio)) + return PTR_ERR(folio); + si->folio = folio; si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); @@ -1048,6 +1050,24 @@ xfarray_sort( out_free: trace_xfarray_sort_stats(si, error); + xfarray_sort_scan_done(si); kvfree(si); return error; } + +/* How many bytes is this array consuming? */ +unsigned long long +xfarray_bytes( + struct xfarray *array) +{ + return xfile_bytes(array->xfile); +} + +/* Empty the entire array. */ +void +xfarray_truncate( + struct xfarray *array) +{ + xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE); + array->nr = 0; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index acb2f94c56c1..5eeeeed13ae2 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -8,6 +8,7 @@ /* xfile array index type, along with cursor initialization */ typedef uint64_t xfarray_idx_t; +#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL) #define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) /* Iterate each index of an xfile array. */ @@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +void xfarray_truncate(struct xfarray *array); +unsigned long long xfarray_bytes(struct xfarray *array); /* * Load an array element, but zero the buffer if there's no data because we @@ -124,6 +127,9 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* Cache a folio here for faster scanning for pivots */ struct folio *folio; diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c new file mode 100644 index 000000000000..6ef2a9637f16 --- /dev/null +++ b/fs/xfs/scrub/xfblob.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. 
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index 17c982a4821d..cdd13ed9c569 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -7,9 +7,9 @@
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
-#include "scrub/scrub.h"
 #include "scrub/trace.h"
 
 /*
@@ -486,6 +486,9 @@ xfarray_sortinfo_alloc(
 	xfarray_sortinfo_lo(si)[0] = 0;
 	xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+	si->relax = INIT_XCHK_RELAX;
+	if (flags & XFARRAY_SORT_KILLABLE)
+		si->relax.interruptible = false;
 
 	trace_xfarray_sort(si, nr_bytes);
 	*infop = si;
@@ -503,10 +506,7 @@ xfarray_sort_terminated(
 	 * few seconds so that we don't run afoul of the soft lockup watchdog
 	 * or RCU stall detector.
 	 */
-	cond_resched();
-
-	if ((si->flags & XFARRAY_SORT_KILLABLE) &&
-	    fatal_signal_pending(current)) {
+	if (xchk_maybe_relax(&si->relax)) {
 		if (*error == 0)
 			*error = -EINTR;
 		return true;
@@ -822,12 +822,14 @@ xfarray_sort_scan(
 
 	/* Grab the first folio that backs this array element. */
 	if (!si->folio) {
+		struct folio	*folio;
 		loff_t		next_pos;
 
-		si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+		folio = xfile_get_folio(si->array->xfile, idx_pos,
 				si->array->obj_size, XFILE_ALLOC);
-		if (IS_ERR(si->folio))
-			return PTR_ERR(si->folio);
+		if (IS_ERR(folio))
+			return PTR_ERR(folio);
+		si->folio = folio;
 
 		si->first_folio_idx = xfarray_idx(si->array,
 				folio_pos(si->folio) + si->array->obj_size - 1);
@@ -1048,6 +1050,24 @@ xfarray_sort(
 
 out_free:
 	trace_xfarray_sort_stats(si, error);
+	xfarray_sort_scan_done(si);
 	kvfree(si);
 	return error;
 }
+
+/* How many bytes is this array consuming? */
+unsigned long long
+xfarray_bytes(
+	struct xfarray	*array)
+{
+	return xfile_bytes(array->xfile);
+}
+
+/* Empty the entire array. */
+void
+xfarray_truncate(
+	struct xfarray	*array)
+{
+	xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
+	array->nr = 0;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index acb2f94c56c1..5eeeeed13ae2 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -8,6 +8,7 @@
 
 /* xfile array index type, along with cursor initialization */
 typedef uint64_t	xfarray_idx_t;
+#define XFARRAY_NULLIDX		((__force xfarray_idx_t)-1ULL)
 #define XFARRAY_CURSOR_INIT	((__force xfarray_idx_t)0)
 
 /* Iterate each index of an xfile array. */
@@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
 int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
 int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
 bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+void xfarray_truncate(struct xfarray *array);
+unsigned long long xfarray_bytes(struct xfarray *array);
 
 /*
  * Load an array element, but zero the buffer if there's no data because we
@@ -124,6 +127,9 @@ struct xfarray_sortinfo {
 	/* XFARRAY_SORT_* flags; see below. */
 	unsigned int		flags;
 
+	/* next time we want to cond_resched() */
+	struct xchk_relax	relax;
+
 	/* Cache a folio here for faster scanning for pivots */
 	struct folio		*folio;
 
diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c
new file mode 100644
index 000000000000..6ef2a9637f16
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+
+/*
+ * XFS Blob Storage
+ * ================
+ * Stores and retrieves blobs using an xfile.  Objects are appended to the
+ * file and the offset is returned as a magic cookie for retrieval.
+ */
+
+#define XB_KEY_MAGIC	0xABAADDAD
+struct xb_key {
+	uint32_t	xb_magic;  /* XB_KEY_MAGIC */
+	uint32_t	xb_size;   /* size of the blob, in bytes */
+	loff_t		xb_offset; /* byte offset of this key */
+	/* blob comes after here */
+} __packed;
+
+/* Initialize a blob storage object. */
+int
+xfblob_create(
+	const char	*description,
+	struct xfblob	**blobp)
+{
+	struct xfblob	*blob;
+	struct xfile	*xfile;
+	int		error;
+
+	error = xfile_create(description, 0, &xfile);
+	if (error)
+		return error;
+
+	blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS);
+	if (!blob) {
+		error = -ENOMEM;
+		goto out_xfile;
+	}
+
+	blob->xfile = xfile;
+	blob->last_offset = PAGE_SIZE;
+
+	*blobp = blob;
+	return 0;
+
+out_xfile:
+	xfile_destroy(xfile);
+	return error;
+}
+
+/* Destroy a blob storage object. */
+void
+xfblob_destroy(
+	struct xfblob	*blob)
+{
+	xfile_destroy(blob->xfile);
+	kfree(blob);
+}
+
+/* Retrieve a blob. */
+int
+xfblob_load(
+	struct xfblob	*blob,
+	xfblob_cookie	cookie,
+	void		*ptr,
+	uint32_t	size)
+{
+	struct xb_key	key;
+	int		error;
+
+	error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+	if (error)
+		return error;
+
+	if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+		ASSERT(0);
+		return -ENODATA;
+	}
+	if (size < key.xb_size) {
+		ASSERT(0);
+		return -EFBIG;
+	}
+
+	return xfile_load(blob->xfile, ptr, key.xb_size,
+			cookie + sizeof(key));
+}
+
+/* Store a blob. */
+int
+xfblob_store(
+	struct xfblob	*blob,
+	xfblob_cookie	*cookie,
+	const void	*ptr,
+	uint32_t	size)
+{
+	struct xb_key	key = {
+		.xb_offset	= blob->last_offset,
+		.xb_magic	= XB_KEY_MAGIC,
+		.xb_size	= size,
+	};
+	loff_t		pos = blob->last_offset;
+	int		error;
+
+	error = xfile_store(blob->xfile, &key, sizeof(key), pos);
+	if (error)
+		return error;
+
+	pos += sizeof(key);
+	error = xfile_store(blob->xfile, ptr, size, pos);
+	if (error)
+		goto out_err;
+
+	*cookie = blob->last_offset;
+	blob->last_offset += sizeof(key) + size;
+	return 0;
+out_err:
+	xfile_discard(blob->xfile, blob->last_offset, sizeof(key));
+	return error;
+}
+
+/* Free a blob. */
+int
+xfblob_free(
+	struct xfblob	*blob,
+	xfblob_cookie	cookie)
+{
+	struct xb_key	key;
+	int		error;
+
+	error = xfile_load(blob->xfile, &key, sizeof(key), cookie);
+	if (error)
+		return error;
+
+	if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) {
+		ASSERT(0);
+		return -ENODATA;
+	}
+
+	xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size);
+	return 0;
+}
+
+/* How many bytes is this blob storage object consuming? */
+unsigned long long
+xfblob_bytes(
+	struct xfblob	*blob)
+{
+	return xfile_bytes(blob->xfile);
+}
+
+/* Drop all the blobs. */
+void
+xfblob_truncate(
+	struct xfblob	*blob)
+{
+	xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE);
+	blob->last_offset = PAGE_SIZE;
+}
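
The cookie contract documented above (xfblob_store returns the byte offset of the record header, which is the only handle to the blob) makes round trips straightforward. A minimal sketch under those APIs; the function name is hypothetical and error paths are compressed:

	/* Hypothetical round trip through the xfblob API introduced above. */
	static int
	xfblob_roundtrip_example(void)
	{
		struct xfblob	*blob;
		xfblob_cookie	cookie;
		char		buf[16];
		int		error;

		error = xfblob_create("example blobs", &blob);
		if (error)
			return error;

		/* Append a 6-byte blob; the cookie is its only handle. */
		error = xfblob_store(blob, &cookie, "hello", 6);
		if (error)
			goto out;

		/* Read it back; the buffer must hold at least xb_size bytes. */
		error = xfblob_load(blob, cookie, buf, sizeof(buf));
		if (error)
			goto out;

		error = xfblob_free(blob, cookie);
	out:
		xfblob_destroy(blob);
		return error;
	}

Note that freeing a blob only punches out its backing pages; last_offset keeps growing, so a long-lived blob store is reset with xfblob_truncate.
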
diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h
new file mode 100644
index 000000000000..ae78322613ca
--- /dev/null
+++ b/fs/xfs/scrub/xfblob.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFBLOB_H__
+#define __XFS_SCRUB_XFBLOB_H__
+
+struct xfblob {
+	struct xfile	*xfile;
+	loff_t		last_offset;
+};
+
+typedef loff_t	xfblob_cookie;
+
+int xfblob_create(const char *descr, struct xfblob **blobp);
+void xfblob_destroy(struct xfblob *blob);
+int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr,
+		uint32_t size);
+int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr,
+		uint32_t size);
+int xfblob_free(struct xfblob *blob, xfblob_cookie cookie);
+unsigned long long xfblob_bytes(struct xfblob *blob);
+void xfblob_truncate(struct xfblob *blob);
+
+static inline int
+xfblob_storename(
+	struct xfblob		*blob,
+	xfblob_cookie		*cookie,
+	const struct xfs_name	*xname)
+{
+	return xfblob_store(blob, cookie, xname->name, xname->len);
+}
+
+static inline int
+xfblob_loadname(
+	struct xfblob		*blob,
+	xfblob_cookie		cookie,
+	struct xfs_name		*xname,
+	uint32_t		size)
+{
+	int ret = xfblob_load(blob, cookie, (void *)xname->name, size);
+	if (ret)
+		return ret;
+
+	xname->len = size;
+	return 0;
+}
+
+#endif /* __XFS_SCRUB_XFBLOB_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 8cdd863db585..c753c79df203 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -10,9 +10,9 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
-#include "scrub/scrub.h"
 #include "scrub/trace.h"
 
 #include <linux/shmem_fs.h>
@@ -126,7 +126,7 @@ xfile_load(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_READ) < 0)
 			break;
 		if (!folio) {
@@ -196,7 +196,7 @@ xfile_store(
 		unsigned int	len;
 		unsigned int	offset;
 
-		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 				SGP_CACHE) < 0)
 			break;
 		if (filemap_check_wb_err(inode->i_mapping, 0)) {
@@ -267,7 +267,7 @@ xfile_get_folio(
 		i_size_write(inode, pos + len);
 
 	pflags = memalloc_nofs_save();
-	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
 			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
 	memalloc_nofs_restore(pflags);
 	if (error)
@@ -293,7 +293,7 @@ xfile_get_folio(
 	 * (potentially last) reference in xfile_put_folio.
 	 */
 	if (flags & XFILE_ALLOC)
-		folio_set_dirty(folio);
+		folio_mark_dirty(folio);
 
 	return folio;
 }
@@ -310,3 +310,15 @@ xfile_put_folio(
 	folio_unlock(folio);
 	folio_put(folio);
 }
+
+/* Discard the page cache that's backing a range of the xfile. */
+void
+xfile_discard(
+	struct xfile	*xf,
+	loff_t		pos,
+	u64		count)
+{
+	trace_xfile_discard(xf, pos, count);
+
+	shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1);
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index 76d78dba7e34..cc2cc1714cd4 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -17,6 +17,7 @@ int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
 
 int xfile_store(struct xfile *xf, const void *buf, size_t count,
 		loff_t pos);
+void xfile_discard(struct xfile *xf, loff_t pos, u64 count);
 loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
 
 #define XFILE_MAX_FOLIO_SIZE	(PAGE_SIZE << MAX_PAGECACHE_ORDER)
@@ -26,4 +27,9 @@ struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
 		unsigned int flags);
 void xfile_put_folio(struct xfile *xf, struct folio *folio);
 
+static inline unsigned long long xfile_bytes(struct xfile *xf)
+{
+	return file_inode(xf->file)->i_blocks << SECTOR_SHIFT;
+}
+
 #endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index a39befa743ce..f17173b83e6f 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,11 @@
 #define __XFS_SCRUB_H__
 
 #ifndef CONFIG_XFS_ONLINE_SCRUB
-# define xfs_scrub_metadata(file, sm)		(-ENOTTY)
+# define xfs_ioc_scrub_metadata(f, a)		(-ENOTTY)
+# define xfs_ioc_scrubv_metadata(f, a)		(-ENOTTY)
 #else
-int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+int xfs_ioc_scrub_metadata(struct file *file, void __user *arg);
+int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg);
 #endif /* CONFIG_XFS_ONLINE_SCRUB */
 
 #endif /* __XFS_SCRUB_H__ */
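
With the renamed entry points now taking the raw __user pointer, the ioctl switch can hand the argument straight through and the !CONFIG_XFS_ONLINE_SCRUB stubs still satisfy the same call sites. A minimal sketch of such a dispatcher, assuming the XFS_IOC_SCRUB_METADATA and XFS_IOC_SCRUBV_METADATA command numbers; this illustrates the calling convention, not the actual xfs_file_ioctl body:

	/* Illustrative dispatch for the renamed scrub entry points. */
	long
	example_scrub_ioctl(
		struct file	*filp,
		unsigned int	cmd,
		void __user	*arg)
	{
		switch (cmd) {
		case XFS_IOC_SCRUB_METADATA:
			return xfs_ioc_scrub_metadata(filp, arg);
		case XFS_IOC_SCRUBV_METADATA:
			return xfs_ioc_scrubv_metadata(filp, arg);
		default:
			return -ENOTTY;
		}
	}
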