diff options
Diffstat (limited to 'fs/xfs/libxfs')
99 files changed, 18337 insertions, 5046 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 39d9525270b7..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -30,137 +30,7 @@ #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" - - -/* - * Passive reference counting access wrappers to the perag structures. If the - * per-ag structure is to be freed, the freeing code is responsible for cleaning - * up objects with passive references before freeing the structure. This is - * things like cached buffers. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - trace_xfs_perag_get(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) >= 0); - atomic_inc(&pag->pag_ref); - } - rcu_read_unlock(); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - unsigned int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_get_tag(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - rcu_read_unlock(); - return pag; -} - -/* Get a passive reference to the given perag. */ -struct xfs_perag * -xfs_perag_hold( - struct xfs_perag *pag) -{ - ASSERT(atomic_read(&pag->pag_ref) > 0 || - atomic_read(&pag->pag_active_ref) > 0); - - trace_xfs_perag_hold(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - trace_xfs_perag_put(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) > 0); - atomic_dec(&pag->pag_ref); -} - -/* - * Active references for perag structures. This is for short term access to the - * per ag structures for walking trees or accessing state. If an AG is being - * shrunk or is offline, then this will fail to find that AG and return NULL - * instead. - */ -struct xfs_perag * -xfs_perag_grab( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - trace_xfs_perag_grab(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_grab_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_grab_tag(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - rcu_read_unlock(); - return pag; -} - -void -xfs_perag_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_rele(pag, _RET_IP_); - if (atomic_dec_and_test(&pag->pag_active_ref)) - wake_up(&pag->pag_active_wq); -} +#include "xfs_group.h" /* * xfs_initialize_perag_data @@ -194,7 +64,7 @@ xfs_initialize_perag_data( pag = xfs_perag_get(mp, index); error = xfs_alloc_read_agf(pag, NULL, 0, NULL); if (!error) - error = xfs_ialloc_read_agi(pag, NULL, NULL); + error = xfs_ialloc_read_agi(pag, NULL, 0, NULL); if (error) { xfs_perag_put(pag); return error; @@ -217,6 +87,7 @@ xfs_initialize_perag_data( */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); error = -EFSCORRUPTED; goto out; } @@ -234,43 +105,32 @@ out: return error; } -STATIC void -__xfs_free_perag( - struct rcu_head *head) +static void +xfs_perag_uninit( + struct xfs_group *xg) { - struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); +#ifdef __KERNEL__ + struct xfs_perag *pag = to_perag(xg); - ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - kmem_free(pag); + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_buf_cache_destroy(&pag->pag_bcache); +#endif } /* - * Free up the per-ag resources associated with the mount structure. + * Free up the per-ag resources within the specified AG range. */ void -xfs_free_perag( - struct xfs_mount *mp) +xfs_free_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno) + { - struct xfs_perag *pag; xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, agno); - spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); - xfs_defer_drain_free(&pag->pag_intents_drain); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_hash_destroy(pag); - - /* drop the mount's active reference */ - xfs_perag_rele(pag); - XFS_IS_CORRUPT(pag->pag_mount, - atomic_read(&pag->pag_active_ref) != 0); - call_rcu(&pag->rcu_head, __xfs_free_perag); - } + for (agno = first_agno; agno < end_agno; agno++) + xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit); } /* Find the size of the AG, in blocks. */ @@ -333,130 +193,100 @@ xfs_agino_range( } /* - * Free perag within the specified AG range, it is only used to free unused - * perags under the error handling path. + * Update the perag of the previous tail AG if it has been changed during + * recovery (i.e. recovery of a growfs). */ -void -xfs_free_unused_perag_range( +int +xfs_update_last_ag_size( struct xfs_mount *mp, - xfs_agnumber_t agstart, - xfs_agnumber_t agend) + xfs_agnumber_t prev_agcount) { - struct xfs_perag *pag; - xfs_agnumber_t index; + struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1); - for (index = agstart; index < agend; index++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + if (!pag) + return -EFSCORRUPTED; + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, + prev_agcount - 1, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); + xfs_perag_rele(pag); + return 0; } -int -xfs_initialize_perag( +static int +xfs_perag_alloc( struct xfs_mount *mp, + xfs_agnumber_t index, xfs_agnumber_t agcount, - xfs_rfsblock_t dblocks, - xfs_agnumber_t *maxagi) + xfs_rfsblock_t dblocks) { struct xfs_perag *pag; - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. - */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - - error = radix_tree_preload(GFP_NOFS); - if (error) - goto out_free_pag; - - spin_lock(&mp->m_perag_lock); - if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - WARN_ON_ONCE(1); - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - error = -EEXIST; - goto out_free_pag; - } - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); + pag = kzalloc(sizeof(*pag), GFP_KERNEL); + if (!pag) + return -ENOMEM; #ifdef __KERNEL__ - /* Place kernel structure only init below this point. */ - spin_lock_init(&pag->pag_ici_lock); - spin_lock_init(&pag->pagb_lock); - spin_lock_init(&pag->pag_state_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - xfs_defer_drain_init(&pag->pag_intents_drain); - init_waitqueue_head(&pag->pagb_wait); - init_waitqueue_head(&pag->pag_active_wq); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); #endif /* __KERNEL__ */ - error = xfs_buf_hash_init(pag); - if (error) - goto out_remove_pag; + error = xfs_buf_cache_init(&pag->pag_bcache); + if (error) + goto out_free_perag; - /* Active ref owned by mount indicates AG is online. */ - atomic_set(&pag->pag_active_ref, 1); + /* + * Pre-calculated geometry + */ + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; + error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); + if (error) + goto out_buf_cache_destroy; - /* - * Pre-calculated geometry - */ - pag->block_count = __xfs_ag_block_count(mp, index, agcount, - dblocks); - pag->min_block = XFS_AGFL_BLOCK(mp); - __xfs_agino_range(mp, pag->block_count, &pag->agino_min, - &pag->agino_max); - } + return 0; + +out_buf_cache_destroy: + xfs_buf_cache_destroy(&pag->pag_bcache); +out_free_perag: + kfree(pag); + return error; +} + +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + int error; - index = xfs_set_inode_alloc(mp, agcount); + if (orig_agcount >= new_agcount) + return 0; - if (maxagi) - *maxagi = index; + for (index = orig_agcount; index < new_agcount; index++) { + error = xfs_perag_alloc(mp, index, new_agcount, dblocks); + if (error) + goto out_unwind_new_pags; + } + *maxagi = xfs_set_inode_alloc(mp, new_agcount); mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_remove_pag: - xfs_defer_drain_free(&pag->pag_intents_drain); - spin_lock(&mp->m_perag_lock); - radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); -out_free_pag: - kmem_free(pag); out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - xfs_free_unused_perag_range(mp, first_initialised, agcount); + xfs_free_perag_range(mp, orig_agcount, index); return error; } @@ -471,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; @@ -491,7 +321,7 @@ xfs_btroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); } /* Finish initializing a free space btree. */ @@ -549,7 +379,7 @@ xfs_freesp_init_recs( } /* - * Alloc btree root block init functions + * bnobt/cntbt btree root block init functions */ static void xfs_bnoroot_init( @@ -557,17 +387,7 @@ xfs_bnoroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); - xfs_freesp_init_recs(mp, bp, id); -} - -static void -xfs_cntroot_init( - struct xfs_mount *mp, - struct xfs_buf *bp, - struct aghdr_init_data *id) -{ - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -583,7 +403,7 @@ xfs_rmaproot_init( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO @@ -678,14 +498,13 @@ xfs_agfblock_init( agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(id->agno); agf->agf_length = cpu_to_be32(id->agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_bno_level = cpu_to_be32(1); + agf->agf_cnt_level = cpu_to_be32(1); if (xfs_has_rmapbt(mp)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_rmap_level = cpu_to_be32(1); agf->agf_rmap_blocks = cpu_to_be32(1); } @@ -796,7 +615,7 @@ struct xfs_aghdr_grow_data { size_t numblks; const struct xfs_buf_ops *ops; aghdr_init_work_f work; - xfs_btnum_t type; + const struct xfs_btree_ops *bc_ops; bool need_init; }; @@ -850,13 +669,15 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, + .bc_ops = &xfs_bnobt_ops, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_cntbt_buf_ops, - .work = &xfs_cntroot_init, + .work = &xfs_bnoroot_init, + .bc_ops = &xfs_cntbt_ops, .need_init = true }, { /* INO root block */ @@ -864,7 +685,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_inobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_INO, + .bc_ops = &xfs_inobt_ops, .need_init = true }, { /* FINO root block */ @@ -872,7 +693,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_FINO, + .bc_ops = &xfs_finobt_ops, .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ @@ -880,6 +701,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, + .bc_ops = &xfs_rmapbt_ops, .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ @@ -887,7 +709,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_REFC, + .bc_ops = &xfs_refcountbt_ops, .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ @@ -905,7 +727,7 @@ xfs_ag_init_headers( id->daddr = dp->daddr; id->numblks = dp->numblks; - id->type = dp->type; + id->bc_ops = dp->bc_ops; error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); if (error) break; @@ -919,7 +741,7 @@ xfs_ag_shrink_space( struct xfs_trans **tpp, xfs_extlen_t delta) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -936,8 +758,8 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, *tpp, &agibp); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp); if (error) return error; @@ -950,8 +772,10 @@ xfs_ag_shrink_space( agf = agfbp->b_addr; aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ - if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) + if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; + } if (delta >= aglen) return -EINVAL; @@ -967,26 +791,33 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ - error = xfs_ag_resv_free(pag); - if (error) - return error; + xfs_ag_resv_free(pag); /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); + xfs_agbno_to_fsb(pag, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; if (error) { /* - * if extent allocation fails, need to roll the transaction to + * If extent allocation fails, need to roll the transaction to * ensure that the AGFL fixup has been committed anyway. + * + * We need to hold the AGF across the roll to ensure nothing can + * access the AG for allocation until the shrink is fully + * cleaned up. And due to the resetting of the AG block + * reservation space needing to lock the AGI, we also have to + * hold that so we don't get AGI/AGF lock order inversions in + * the error handling path. */ xfs_trans_bhold(*tpp, agfbp); + xfs_trans_bhold(*tpp, agibp); err2 = xfs_trans_roll(tpp); if (err2) return err2; xfs_trans_bjoin(*tpp, agfbp); + xfs_trans_bjoin(*tpp, agibp); goto resv_init_out; } @@ -1005,7 +836,7 @@ xfs_ag_shrink_space( goto resv_err; err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, - XFS_AG_RESV_NONE, true); + XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); if (err2) goto resv_err; @@ -1023,9 +854,9 @@ xfs_ag_shrink_space( } /* Update perag geometry */ - pag->block_count -= delta; - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count -= delta; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); @@ -1050,14 +881,15 @@ xfs_ag_extend_space( struct xfs_trans *tp, xfs_extlen_t len) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; - ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, tp, &bp); + error = xfs_ialloc_read_agi(pag, tp, 0, &bp); if (error) return error; @@ -1094,9 +926,9 @@ xfs_ag_extend_space( return error; /* Update perag geometry */ - pag->block_count = be32_to_cpu(agf->agf_length); - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); return 0; } @@ -1114,7 +946,7 @@ xfs_ag_get_geometry( int error; /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); @@ -1123,7 +955,7 @@ xfs_ag_get_geometry( /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = pag->pag_agno; + ageo->ag_number = pag_agno(pag); agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4b343c4fac28..1f24cfa27321 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -7,6 +7,8 @@ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 +#include "xfs_group.h" + struct xfs_mount; struct xfs_trans; struct xfs_perag; @@ -30,14 +32,11 @@ struct xfs_ag_resv { * performance of allocation group selection. */ struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* passive reference count */ - atomic_t pag_active_ref; /* active reference count */ - wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ + struct xfs_group pag_group; unsigned long pag_opstate; - uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ + uint8_t pagf_bno_level; /* # of levels in bno btree */ + uint8_t pagf_cnt_level; /* # of levels in cnt btree */ + uint8_t pagf_rmap_level;/* # of levels in rmap btree */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ @@ -54,7 +53,6 @@ struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; - int pagb_count; /* pagb slots in use */ uint8_t pagf_refcount_level; /* recount btree height */ /* Blocks reserved for all kinds of metadata. */ @@ -62,41 +60,24 @@ struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; - /* for rcu-safe freeing */ - struct rcu_head rcu_head; - /* Precalculated geometry info */ - xfs_agblock_t block_count; - xfs_agblock_t min_block; xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. - * Callers should hold pag_state_lock before accessing this field. - */ - uint16_t pag_checked; - uint16_t pag_sick; - #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write * verifiers while rebuilding the AG btrees. */ - uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_bno_level; + uint8_t pagf_repair_cnt_level; uint8_t pagf_repair_refcount_level; + uint8_t pagf_repair_rmap_level; #endif - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ @@ -104,24 +85,33 @@ struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; + struct xfs_buf_cache pag_bcache; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - - /* - * We use xfs_drain to track the number of deferred log intent items - * that have been queued (but not yet processed) so that waiters (e.g. - * scrub) will not lock resources when other threads are in the middle - * of processing a chain of intent items only to find momentary - * inconsistencies. - */ - struct xfs_defer_drain pag_intents_drain; #endif /* __KERNEL__ */ }; +static inline struct xfs_perag *to_perag(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_perag, pag_group); +} + +static inline struct xfs_group *pag_group(struct xfs_perag *pag) +{ + return &pag->pag_group; +} + +static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_mount; +} + +static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_gno; +} + /* * Per-AG operational state. These are atomic flag bits. */ @@ -143,25 +133,80 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) -void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, - xfs_agnumber_t agend); -int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, + xfs_agnumber_t *maxagi); +void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); -void xfs_free_perag(struct xfs_mount *mp); +int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, - unsigned int tag); -struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); -void xfs_perag_put(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + return to_perag(xfs_group_hold(pag_group(pag))); +} + +static inline void +xfs_perag_put( + struct xfs_perag *pag) +{ + xfs_group_put(pag_group(pag)); +} /* Active AG references */ -struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); -struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); -void xfs_perag_rele(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); +} + +static inline void +xfs_perag_rele( + struct xfs_perag *pag) +{ + xfs_group_rele(pag_group(pag)); +} + +static inline struct xfs_perag * +xfs_perag_next_range( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno, + xfs_agnumber_t end_agno) +{ + return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL, + start_agno, end_agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_next_from( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno) +{ + return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); +} + +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + return xfs_perag_next_from(mp, pag, 0); +} /* * Per-ag geometry infomation and validation @@ -173,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { - if (agbno >= pag->block_count) - return false; - if (agbno <= pag->min_block) - return false; - return true; + return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool @@ -186,13 +227,7 @@ xfs_verify_agbext( xfs_agblock_t agbno, xfs_agblock_t len) { - if (agbno + len <= agbno) - return false; - - if (!xfs_verify_agbno(pag, agbno)) - return false; - - return xfs_verify_agbno(pag, agbno + len - 1); + return xfs_verify_gbext(pag_group(pag), agbno, len); } /* @@ -228,47 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } -/* - * Perag iteration APIs - */ -static inline struct xfs_perag * -xfs_perag_next( - struct xfs_perag *pag, - xfs_agnumber_t *agno, - xfs_agnumber_t end_agno) -{ - struct xfs_mount *mp = pag->pag_mount; - - *agno = pag->pag_agno + 1; - xfs_perag_rele(pag); - while (*agno <= end_agno) { - pag = xfs_perag_grab(mp, *agno); - if (pag) - return pag; - (*agno)++; - } - return NULL; -} - -#define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_grab((mp), (agno)); \ - (pag) != NULL; \ - (pag) = xfs_perag_next((pag), &(agno), (end_agno))) - -#define for_each_perag_from(mp, agno, pag) \ - for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - -#define for_each_perag(mp, agno, pag) \ - (agno) = 0; \ - for_each_perag_from((mp), (agno), (pag)) - -#define for_each_perag_tag(mp, agno, pag, tag) \ - for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \ - (pag) != NULL; \ - (agno) = (pag)->pag_agno + 1, \ - xfs_perag_rele(pag), \ - (pag) = xfs_perag_grab_tag((mp), (agno), (tag))) - static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, @@ -277,9 +271,9 @@ xfs_perag_next_wrap( xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - *agno = pag->pag_agno + 1; + *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { @@ -331,7 +325,7 @@ struct aghdr_init_data { /* per header data */ xfs_daddr_t daddr; /* header location */ size_t numblks; /* size of header */ - xfs_btnum_t type; /* type of btree root block */ + const struct xfs_btree_ops *bc_ops; /* btree ops */ }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); @@ -341,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +static inline xfs_fsblock_t +xfs_agbno_to_fsb( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_daddr_t +xfs_agbno_to_daddr( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_ino_t +xfs_agino_to_ino( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); +} + #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index da1057bd0e60..fb79215a509d 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -70,6 +70,7 @@ xfs_ag_resv_critical( struct xfs_perag *pag, enum xfs_ag_resv_type type) { + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t avail; xfs_extlen_t orig; @@ -92,8 +93,8 @@ xfs_ag_resv_critical( /* Critically low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || - avail < pag->pag_mount->m_agbtree_maxlevels, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); + avail < mp->m_agbtree_maxlevels, + mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -113,6 +114,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_METAFILE: case XFS_AG_RESV_NONE: /* empty */ break; @@ -126,20 +128,19 @@ xfs_ag_resv_needed( } /* Clean out a reservation */ -static int +static void __xfs_ag_resv_free( struct xfs_perag *pag, enum xfs_ag_resv_type type) { struct xfs_ag_resv *resv; xfs_extlen_t oldresv; - int error; trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - if (pag->pag_agno == 0) - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag_agno(pag) == 0) + pag_mount(pag)->m_ag_max_usable += resv->ar_asked; /* * RMAPBT blocks come from the AGFL and AGFL blocks are always * considered "free", so whatever was reserved at mount time must be @@ -149,30 +150,19 @@ __xfs_ag_resv_free( oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; - error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + xfs_add_fdblocks(pag_mount(pag), oldresv); resv->ar_reserved = 0; resv->ar_asked = 0; resv->ar_orig_reserved = 0; - - if (error) - trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); - return error; } /* Free a per-AG reservation. */ -int +void xfs_ag_resv_free( struct xfs_perag *pag) { - int error; - int err2; - - error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); - err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); - if (err2 && !error) - error = err2; - return error; + __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); + __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); } static int @@ -182,7 +172,7 @@ __xfs_ag_resv_init( xfs_extlen_t ask, xfs_extlen_t used) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_ag_resv *resv; int error; xfs_extlen_t hidden_space; @@ -216,13 +206,12 @@ __xfs_ag_resv_init( if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else - error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); + trace_xfs_ag_resv_init_error(pag, error, _RET_IP_); xfs_warn(mp, "Per-AG reservation for AG %u failed. Filesystem may run out of space.", - pag->pag_agno); + pag_agno(pag)); return error; } @@ -232,7 +221,7 @@ __xfs_ag_resv_init( * counter, we only make the adjustment for AG 0. This assumes that * there aren't any AGs hungrier for per-AG reservation than AG 0. */ - if (pag->pag_agno == 0) + if (pag_agno(pag) == 0) mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); @@ -250,7 +239,7 @@ xfs_ag_resv_init( struct xfs_perag *pag, struct xfs_trans *tp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; @@ -359,6 +348,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -401,6 +391,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index b74b210008ea..f247eeff7358 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -6,7 +6,7 @@ #ifndef __XFS_AG_RESV_H__ #define __XFS_AG_RESV_H__ -int xfs_ag_resv_free(struct xfs_perag *pag); +void xfs_ag_resv_free(struct xfs_perag *pag); int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp); bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); @@ -33,23 +33,4 @@ xfs_perag_resv( } } -/* - * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from - * the AGFL, they are allocated one at a time and the reservation updates don't - * require a transaction. - */ -static inline void -xfs_ag_resv_rmapbt_alloc( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_alloc_arg args = { NULL }; - struct xfs_perag *pag; - - args.len = 1; - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3bd0a33fee0a..7839efe050bf 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -26,13 +26,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" +#include "xfs_health.h" +#include "xfs_extfree_item.h" struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; -#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) - #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 @@ -78,7 +78,7 @@ xfs_prealloc_blocks( } /* - * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * The number of blocks per AG that we withhold from xfs_dec_fdblocks to * guarantee that we can refill the AGFL prior to allocating space in a nearly * full AG. Although the space described by the free space btrees, the * blocks used by the freesp btrees themselves, and the blocks owned by the @@ -88,7 +88,7 @@ xfs_prealloc_blocks( * until the fs goes down, we subtract this many AG blocks from the incore * fdblocks to ensure user allocation does not overcommit the space the * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to - * withhold space from xfs_mod_fdblocks, so we do not account for that here. + * withhold space from xfs_dec_fdblocks, so we do not account for that here. */ #define XFS_ALLOCBT_AGFL_RESERVE 4 @@ -150,23 +150,38 @@ xfs_alloc_ag_max_usable( return mp->m_sb.sb_agblocks - blocks; } + +static int +xfs_alloc_lookup( + struct xfs_btree_cur *cur, + xfs_lookup_t dir, + xfs_agblock_t bno, + xfs_extlen_t len, + int *stat) +{ + int error; + + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + error = xfs_btree_lookup(cur, dir, stat); + if (*stat == 1) + cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; + else + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + return error; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ -STATIC int /* error */ +static inline int /* error */ xfs_alloc_lookup_eq( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat); } /* @@ -180,13 +195,7 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat); } /* @@ -200,19 +209,14 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); } static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_ag.abt.active; + return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); } /* @@ -268,12 +272,12 @@ xfs_alloc_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -297,7 +301,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -325,7 +329,8 @@ xfs_alloc_compute_aligned( bool busy; /* Trim busy sections out of found extent */ - busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); + busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, + args->maxlen, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -403,8 +408,8 @@ xfs_alloc_compute_diff( if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { if (newlen1 < newlen2 || (newlen1 == newlen2 && - XFS_ABSDIFF(newbno1, wantbno) > - XFS_ABSDIFF(newbno2, wantbno))) + abs_diff(newbno1, wantbno) > + abs_diff(newbno2, wantbno))) newbno1 = newbno2; } else if (newbno2 != NULLAGBLOCK) newbno1 = newbno2; @@ -420,7 +425,7 @@ xfs_alloc_compute_diff( } else newbno1 = freeend - wantlen; *newbnop = newbno1; - return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); + return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); } /* @@ -461,6 +466,97 @@ xfs_alloc_fix_len( } /* + * Determine if the cursor points to the block that contains the right-most + * block of records in the by-count btree. This block contains the largest + * contiguous free extent in the AG, so if we modify a record in this block we + * need to call xfs_alloc_fixup_longest() once the modifications are done to + * ensure the agf->agf_longest field is kept up to date with the longest free + * extent tracked by the by-count btree. + */ +static bool +xfs_alloc_cursor_at_lastrec( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cnt_cur, 0, &bp); + + xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB); + return xfs_btree_ptr_is_null(cnt_cur, &ptr); +} + +/* + * Find the rightmost record of the cntbt, and return the longest free space + * recorded in it. Simply set both the block number and the length to their + * maximum values before searching. + */ +static int +xfs_cntbt_longest( + struct xfs_btree_cur *cnt_cur, + xfs_extlen_t *longest) +{ + struct xfs_alloc_rec_incore irec; + union xfs_btree_rec *rec; + int stat = 0; + int error; + + memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec)); + error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + /* totally empty tree */ + *longest = 0; + return 0; + } + + error = xfs_btree_get_rec(cnt_cur, &rec, &stat); + if (error) + return error; + if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } + + xfs_alloc_btrec_to_irec(rec, &irec); + *longest = irec.ar_blockcount; + return 0; +} + +/* + * Update the longest contiguous free extent in the AG from the by-count cursor + * that is passed to us. This should be done at the end of any allocation or + * freeing operation that touches the longest extent in the btree. + * + * Needing to update the longest extent can be determined by calling + * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record + * modification but before the modification begins. + */ +static int +xfs_alloc_fixup_longest( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_perag *pag = to_perag(cnt_cur->bc_group); + struct xfs_buf *bp = cnt_cur->bc_ag.agbp; + struct xfs_agf *agf = bp->b_addr; + xfs_extlen_t longest = 0; + int error; + + /* Lookup last rec in order to update AGF. */ + error = xfs_cntbt_longest(cnt_cur, &longest); + if (error) + return error; + + pag->pagf_longest = longest; + agf->agf_longest = cpu_to_be32(pag->pagf_longest); + xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST); + + return 0; +} + +/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. @@ -484,6 +580,7 @@ xfs_alloc_fixup_trees( xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ struct xfs_mount *mp; + bool fixup_longest = false; mp = cnt_cur->bc_mp; @@ -497,14 +594,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Look up the record in the by-block tree if necessary. @@ -516,14 +617,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #ifdef DEBUG @@ -536,8 +641,10 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, bnoblock->bb_numrecs != - cntblock->bb_numrecs)) + cntblock->bb_numrecs)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #endif @@ -562,35 +669,49 @@ xfs_alloc_fixup_trees( nfbno2 = rbno + rlen; nflen2 = (fbno + flen) - nfbno2; } + + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; + /* * Delete the entry from the by-size btree. */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Fix up the by-block btree entry(s). @@ -601,8 +722,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } else { /* * Update the by-block entry to start later|be shorter. @@ -616,13 +739,21 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } + + if (fixup_longest) + return xfs_alloc_fixup_longest(cnt_cur); + return 0; } @@ -667,7 +798,7 @@ xfs_agfl_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag))) return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { @@ -747,14 +878,15 @@ xfs_alloc_read_agfl( struct xfs_trans *tp, struct xfs_buf **bpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; int error; - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); if (error) return error; xfs_buf_set_ref(bp, XFS_AGFL_REF); @@ -776,6 +908,7 @@ xfs_alloc_update_counters( if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); + xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; } @@ -828,8 +961,8 @@ xfs_alloc_cur_setup( * attempt a small allocation. */ if (!acur->cnt) - acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_CNT); + acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -838,11 +971,11 @@ xfs_alloc_cur_setup( * Allocate the bnobt left and right search cursors. */ if (!acur->bnolt) - acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); if (!acur->bnogt) - acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); return i == 1 ? 0 : -ENOSPC; } @@ -884,15 +1017,17 @@ xfs_alloc_cur_check( bool busy; unsigned busy_gen = 0; bool deactivate = false; - bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; + bool isbnobt = xfs_btree_is_bno(cur->bc_ops); *new = 0; error = xfs_alloc_get_rec(cur, &bno, &len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* * Check minlen and deactivate a cntbt cursor if out of acceptable size @@ -958,9 +1093,8 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_ag.abt.active = false; - trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, - *new); + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); return 0; } @@ -973,13 +1107,12 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1098,6 +1231,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1116,14 +1250,14 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, + xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, - XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + xfs_agbno_to_daddr(args->pag, fbno), args->mp->m_bsize, 0, &bp); if (error) goto error; @@ -1132,6 +1266,7 @@ xfs_alloc_ag_vextent_small( *fbnop = args->agbno = fbno; *flenp = args->len = 1; if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1180,7 +1315,6 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; @@ -1197,8 +1331,8 @@ xfs_alloc_ag_vextent_exact( /* * Allocate/initialize a cursor for the by-number freespace btree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1218,6 +1352,7 @@ xfs_alloc_ag_vextent_exact( if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1228,7 +1363,8 @@ xfs_alloc_ag_vextent_exact( */ tbno = fbno; tlen = flen; - xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); + xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, + &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1257,9 +1393,9 @@ xfs_alloc_ag_vextent_exact( * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -1330,7 +1466,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_ag.abt.active = false; + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; if (count > 0) count--; @@ -1444,7 +1580,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_ag.abt.active = true; + acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; fbcur = acur->cnt; fbinc = false; } @@ -1497,8 +1633,10 @@ xfs_alloc_ag_vextent_lastblock( error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(acur->cnt); return -EFSCORRUPTED; + } if (*len >= args->minlen) break; error = xfs_btree_increment(acur->cnt, 0, &i); @@ -1619,8 +1757,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_near_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - acur.busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), acur.busy_gen, + alloc_flags); if (error) goto out; @@ -1670,8 +1809,8 @@ restart: /* * Allocate and initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); bno_cur = NULL; /* @@ -1710,6 +1849,7 @@ restart: if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1734,8 +1874,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1756,6 +1897,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1778,10 +1920,11 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1790,6 +1933,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1806,6 +1950,7 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1829,8 +1974,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1844,14 +1990,15 @@ restart: rlen = args->len; if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } /* * Allocate and initialize a cursor for the by-block tree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1863,6 +2010,7 @@ restart: if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto error0; } @@ -1887,11 +2035,10 @@ out_nominleft: /* * Free the extent starting at agno/bno for length. */ -STATIC int +int xfs_free_ag_extent( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, @@ -1911,6 +2058,7 @@ xfs_free_ag_extent( int i; int error; struct xfs_perag *pag = agbp->b_pag; + bool fixup_longest = false; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; @@ -1924,7 +2072,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1938,6 +2086,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1953,6 +2102,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1971,6 +2121,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1986,6 +2137,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1994,7 +2146,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. @@ -2006,12 +2158,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2021,12 +2175,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2036,6 +2192,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2045,6 +2202,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2064,6 +2222,7 @@ xfs_free_ag_extent( i != 1 || xxbno != ltbno || xxlen != ltlen)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2088,12 +2247,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2104,6 +2265,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2123,12 +2285,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2151,27 +2315,43 @@ xfs_free_ag_extent( if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; + /* * In all cases we need to insert the new freespace in the by-size tree. + * + * If this new freespace is being inserted in the block that contains + * the largest free space in the btree, make sure we also fix up the + * agf->agf-longest tracker field. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } + if (fixup_longest) { + error = xfs_alloc_fixup_longest(cnt_cur); + if (error) + goto error0; + } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; @@ -2179,19 +2359,19 @@ xfs_free_ag_extent( * Update the freespace totals in the ag and superblock. */ error = xfs_alloc_update_counters(tp, agbp, len); - xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); if (error) goto error0; XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); + trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); + trace_xfs_free_extent(pag, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2250,7 +2430,7 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ @@ -2267,8 +2447,9 @@ xfs_alloc_min_freelist( struct xfs_perag *pag) { /* AG btrees have at least 1 level. */ - static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; - const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; + const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; + const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; + const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; unsigned int min_free; ASSERT(mp->m_alloc_maxlevels > 0); @@ -2295,16 +2476,12 @@ xfs_alloc_min_freelist( */ /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels) * 2 - 2; + min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels) * 2 - 2; + min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) - min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels) * 2 - 2; - + min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; return min_free; } @@ -2361,32 +2538,6 @@ xfs_alloc_space_available( return true; } -int -xfs_free_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_buf *agbp, - struct xfs_owner_info *oinfo) -{ - int error; - struct xfs_buf *bp; - - error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, - XFS_AG_RESV_AGFL); - if (error) - return error; - - error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, - XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), - tp->t_mountp->m_bsize, 0, &bp); - if (error) - return error; - xfs_trans_binval(tp, bp); - - return 0; -} - /* * Check the agfl fields of the agf for inconsistency or corruption. * @@ -2462,7 +2613,7 @@ xfs_agfl_reset( xfs_warn(mp, "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " "Please unmount and run xfs_repair.", - pag->pag_agno, pag->pagf_flcount); + pag_agno(pag), pag->pagf_flcount); agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); @@ -2475,48 +2626,6 @@ xfs_agfl_reset( } /* - * Defer an AGFL block free. This is effectively equivalent to - * xfs_free_extent_later() with some special handling particular to AGFL blocks. - * - * Deferring AGFL frees helps prevent log reservation overruns due to too many - * allocation operations in a transaction. AGFL frees are prone to this problem - * because for one they are always freed one at a time. Further, an immediate - * AGFL block free can cause a btree join and require another block free before - * the real allocation can proceed. Deferring the free disconnects freeing up - * the AGFL slot from freeing the block. - */ -static int -xfs_defer_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_owner_info *oinfo) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *xefi; - xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); - - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(oinfo != NULL); - - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) - return -EFSCORRUPTED; - - xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = fsbno; - xefi->xefi_blockcount = 1; - xefi->xefi_owner = oinfo->oi_owner; - xefi->xefi_agresv = XFS_AG_RESV_AGFL; - - trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - - xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); - return 0; -} - -/* * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ @@ -2527,39 +2636,37 @@ xfs_defer_extent_free( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard, + unsigned int free_flags, struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(type != XFS_AG_RESV_AGFL); + ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) - return -EFSCORRUPTED; + if (free_flags & XFS_FREE_EXTENT_REALTIME) { + if (type != XFS_AG_RESV_NONE) { + ASSERT(type == XFS_AG_RESV_NONE); + return -EFSCORRUPTED; + } + if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len))) + return -EFSCORRUPTED; + } else { + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + } xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; xefi->xefi_agresv = type; - if (skip_discard) + if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (free_flags & XFS_FREE_EXTENT_REALTIME) + xefi->xefi_flags |= XFS_EFI_REALTIME; if (oinfo) { ASSERT(oinfo->oi_offset == 0); @@ -2571,12 +2678,8 @@ xfs_defer_extent_free( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(mp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_extent_free_get_group(mp, xefi); - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); + xfs_extent_free_defer_add(tp, xefi, dfpp); return 0; } @@ -2587,11 +2690,11 @@ xfs_free_extent_later( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + unsigned int free_flags) { struct xfs_defer_pending *dontcare = NULL; - return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags, &dontcare); } @@ -2616,13 +2719,13 @@ xfs_free_extent_later( int xfs_alloc_schedule_autoreap( const struct xfs_alloc_arg *args, - bool skip_discard, + unsigned int free_flags, struct xfs_alloc_autoreap *aarp) { int error; error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, - &args->oinfo, args->resv, skip_discard, &aarp->dfp); + &args->oinfo, args->resv, free_flags, &aarp->dfp); if (error) return error; @@ -2675,7 +2778,6 @@ xfs_alloc_commit_autoreap( xfs_defer_item_unpause(tp, aarp->dfp); } -#ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to * args->minlen. @@ -2691,13 +2793,14 @@ xfs_exact_minlen_extent_available( xfs_extlen_t flen; int error = 0; - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, + args->pag); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; if (*stat == 0) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto out; } @@ -2714,7 +2817,6 @@ out: return error; } -#endif /* * Decide whether to use this allocation group for this allocation. @@ -2788,15 +2890,14 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; -#ifdef DEBUG - if (args->alloc_minlen_only) { + if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { int stat; error = xfs_exact_minlen_extent_available(args, agbp, &stat); if (error || !stat) goto out_agbp_relse; } -#endif + /* * Make the freelist shorter if it's too long. * @@ -2833,8 +2934,20 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; - /* defer agfl frees */ - error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + /* + * Defer the AGFL block free. + * + * This helps to prevent log reservation overruns due to too + * many allocation operations in a transaction. AGFL frees are + * prone to this problem because for one they are always freed + * one at a time. Further, an immediate AGFL block free can + * cause a btree join and require another block free before the + * real allocation can proceed. + * Deferring the free disconnects freeing up the AGFL slot from + * freeing the block. + */ + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), + 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); if (error) goto out_agbp_relse; } @@ -2987,8 +3100,8 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_versionnum), offsetof(xfs_agf_t, agf_seqno), offsetof(xfs_agf_t, agf_length), - offsetof(xfs_agf_t, agf_roots[0]), - offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ + offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ offsetof(xfs_agf_t, agf_flfirst), offsetof(xfs_agf_t, agf_fllast), offsetof(xfs_agf_t, agf_flcount), @@ -3054,8 +3167,6 @@ xfs_alloc_put_freelist( logflags |= XFS_AGF_BTREEBLKS; } - xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = xfs_buf_to_agfl_bno(agflbp); @@ -3088,7 +3199,7 @@ xfs_validate_ag_length( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && seqno != bp->b_pag->pag_agno) + if (bp->b_pag && seqno != pag_agno(bp->b_pag)) return __this_address; /* @@ -3167,12 +3278,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > - mp->m_alloc_maxlevels || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > - mp->m_alloc_maxlevels) + if (be32_to_cpu(agf->agf_bno_level) < 1 || + be32_to_cpu(agf->agf_cnt_level) < 1 || + be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) return __this_address; if (xfs_has_lazysbcount(mp) && @@ -3183,9 +3292,8 @@ xfs_agf_verify( if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels) + if (be32_to_cpu(agf->agf_rmap_level) < 1 || + be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) return __this_address; } @@ -3260,14 +3368,16 @@ xfs_read_agf( int flags, struct xfs_buf **agfbpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agf(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); if (error) return error; @@ -3287,12 +3397,13 @@ xfs_alloc_read_agf( int flags, struct xfs_buf **agfbpp) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *agfbp; struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_alloc_read_agf(pag); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != @@ -3309,14 +3420,11 @@ xfs_alloc_read_agf( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + if (xfs_agfl_needs_reset(mp, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3329,24 +3437,21 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(pag->pag_mount)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, - &pag->pag_mount->m_allocbt_blks); + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } #ifdef DEBUG - else if (!xfs_is_shutdown(pag->pag_mount)) { + else if (!xfs_is_shutdown(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); - ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); + ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); } #endif if (agfbpp) @@ -3501,7 +3606,7 @@ xfs_alloc_vextent_finish( goto out_drop_perag; } - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno); ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); @@ -3522,8 +3627,8 @@ xfs_alloc_vextent_finish( if (error) goto out_drop_perag; - ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, - args->len)); + ASSERT(!xfs_extent_busy_search(pag_group(args->pag), + args->agbno, args->len)); } xfs_ag_resv_alloc_extent(args->pag, args->resv, args); @@ -3553,21 +3658,20 @@ xfs_alloc_vextent_this_ag( struct xfs_alloc_arg *args, xfs_agnumber_t agno) { - struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == agno); + ASSERT(pag_agno(args->pag) == agno); args->agno = agno; args->agbno = 0; trace_xfs_alloc_vextent_this_ag(args); - error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), - &minimum_agno); + error = xfs_alloc_vextent_check_args(args, + xfs_agbno_to_fsb(args->pag, 0), &minimum_agno); if (error) { if (error == -ENOSPC) return 0; @@ -3772,7 +3876,7 @@ xfs_alloc_vextent_exact_bno( int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3811,7 +3915,7 @@ xfs_alloc_vextent_near_bno( int error; if (!needs_perag) - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3848,7 +3952,7 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = pag->pag_agno; + args.agno = pag_agno(pag); args.pag = pag; /* @@ -3895,29 +3999,34 @@ __xfs_free_extent( return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); - if (error) + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); return error; + } + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } /* validate the extent size is legal now we have the agf locked */ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } - error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, - type); + error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags); return 0; err_release: @@ -3942,7 +4051,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -3962,7 +4071,7 @@ xfs_alloc_query_range( union xfs_btree_irec high_brec = { .a = *high_rec }; struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } @@ -3976,7 +4085,7 @@ xfs_alloc_query_all( { struct xfs_alloc_query_range_info query; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); query.priv = priv; query.fn = fn; return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0b956f8b9d5a..50ef79a1ed41 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -53,11 +53,9 @@ typedef struct xfs_alloc_arg { int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ + bool alloc_minlen_only; /* allocate exact minlen extent */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ -#ifdef DEBUG - bool alloc_minlen_only; /* allocate exact minlen extent */ -#endif } xfs_alloc_arg_t; /* @@ -80,6 +78,9 @@ int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, struct xfs_buf *agflbp, xfs_agblock_t bno, int btreeblk); +int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agblock_t bno, xfs_extlen_t len, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); /* * Compute and fill in value of m_alloc_maxlevels. @@ -194,8 +195,6 @@ int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); -int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, - struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); @@ -233,7 +232,16 @@ xfs_buf_to_agfl_bno( int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type, bool skip_discard); + enum xfs_ag_resv_type type, unsigned int free_flags); + +/* Don't issue a discard for the blocks freed. */ +#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0) + +/* Free blocks on the realtime device. */ +#define XFS_FREE_EXTENT_REALTIME (1U << 1) + +#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \ + XFS_FREE_EXTENT_REALTIME) /* * List of extents to be free "later". @@ -244,25 +252,28 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - struct xfs_perag *xefi_pag; + struct xfs_group *xefi_group; unsigned int xefi_flags; enum xfs_ag_resv_type xefi_agresv; }; -void xfs_extent_free_get_group(struct xfs_mount *mp, - struct xfs_extent_free_item *xefi); - #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ #define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ +#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */ + +static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi) +{ + return xefi->xefi_flags & XFS_EFI_REALTIME; +} struct xfs_alloc_autoreap { struct xfs_defer_pending *dfp; }; int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, - bool skip_discard, struct xfs_alloc_autoreap *aarp); + unsigned int free_flags, struct xfs_alloc_autoreap *aarp); void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, struct xfs_alloc_autoreap *aarp); void xfs_alloc_commit_autoreap(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a7032bf0cd37..a4ac37ba5d51 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -16,6 +16,7 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ag.h" @@ -23,11 +24,19 @@ static struct kmem_cache *xfs_allocbt_cur_cache; STATIC struct xfs_btree_cur * -xfs_allocbt_dup_cursor( +xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); + return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + to_perag(cur->bc_group)); +} + +STATIC struct xfs_btree_cur * +xfs_cntbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + to_perag(cur->bc_group)); } STATIC void @@ -36,15 +45,21 @@ xfs_allocbt_set_root( const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + if (xfs_btree_is_bno(cur->bc_ops)) { + agf->agf_bno_root = ptr->s; + be32_add_cpu(&agf->agf_bno_level, inc); + pag->pagf_bno_level += inc; + } else { + agf->agf_cnt_root = ptr->s; + be32_add_cpu(&agf->agf_cnt_level, inc); + pag->pagf_cnt_level += inc; + } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -60,7 +75,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -71,7 +86,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_group, bno, 1, false); new->s = cpu_to_be32(bno); @@ -89,78 +104,17 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, - bno, 1); + error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp, + agbp, NULL, bno, 1); if (error) return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } -/* - * Update the longest extent in the AGF - */ -STATIC void -xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, - int reason) -{ - struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - struct xfs_perag *pag; - __be32 len; - int numrecs; - - ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); - - switch (reason) { - case LASTREC_UPDATE: - /* - * If this is the last leaf block and it's the last record, - * then update the size of the longest extent in the AG. - */ - if (ptr != xfs_btree_get_numrecs(block)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_INSREC: - if (be32_to_cpu(rec->alloc.ar_blockcount) <= - be32_to_cpu(agf->agf_longest)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_DELREC: - numrecs = xfs_btree_get_numrecs(block); - if (ptr <= numrecs) - return; - ASSERT(ptr == numrecs + 1); - - if (numrecs) { - xfs_alloc_rec_t *rrp; - - rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); - len = rrp->ar_blockcount; - } else { - len = 0; - } - - break; - default: - ASSERT(0); - return; - } - - agf->agf_longest = len; - pag = cur->bc_ag.agbp->b_pag; - pag->pagf_longest = be32_to_cpu(len); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); -} - STATIC int xfs_allocbt_get_minrecs( struct xfs_btree_cur *cur, @@ -224,9 +178,12 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + if (xfs_btree_is_bno(cur->bc_ops)) + ptr->s = agf->agf_bno_root; + else + ptr->s = agf->agf_cnt_root; } STATIC int64_t @@ -299,13 +256,12 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; - xfs_btnum_t btnum = XFS_BTNUM_BNOi; if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -320,26 +276,32 @@ xfs_allocbt_verify( * against. */ level = be16_to_cpu(block->bb_level); - if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) - btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { - unsigned int maxlevel = pag->pagf_levels[btnum]; + unsigned int maxlevel, repair_maxlevel = 0; -#ifdef CONFIG_XFS_ONLINE_REPAIR /* * Online repair could be rewriting the free space btrees, so * we'll validate against the larger of either tree while this * is going on. */ - maxlevel = max_t(unsigned int, maxlevel, - pag->pagf_repair_levels[btnum]); + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) { + maxlevel = pag->pagf_cnt_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_cnt_level; #endif - if (level >= maxlevel) + } else { + maxlevel = pag->pagf_bno_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_bno_level; +#endif + } + + if (level >= max(maxlevel, repair_maxlevel)) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -348,7 +310,7 @@ xfs_allocbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_allocbt_verify(bp); @@ -372,7 +334,7 @@ xfs_allocbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -454,15 +416,22 @@ xfs_allocbt_keys_contiguous( be32_to_cpu(key2->alloc.ar_startblock)); } -static const struct xfs_btree_ops xfs_bnobt_ops = { +const struct xfs_btree_ops xfs_bnobt_ops = { + .name = "bno", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, - .dup_cursor = xfs_allocbt_dup_cursor, + .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), + .sick_mask = XFS_SICK_AG_BNOBT, + + .dup_cursor = xfs_bnobt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, @@ -477,15 +446,22 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .keys_contiguous = xfs_allocbt_keys_contiguous, }; -static const struct xfs_btree_ops xfs_cntbt_ops = { +const struct xfs_btree_ops xfs_cntbt_ops = { + .name = "cnt", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), + .sick_mask = XFS_SICK_AG_CNTBT, - .dup_cursor = xfs_allocbt_dup_cursor, + .dup_cursor = xfs_cntbt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, @@ -500,76 +476,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .keys_contiguous = NULL, /* not needed right now */ }; -/* Allocate most of a new allocation btree cursor. */ -STATIC struct xfs_btree_cur * -xfs_allocbt_init_common( +/* + * Allocate a new bnobt cursor. + * + * For staging cursors tp and agbp are NULL. + */ +struct xfs_btree_cur * +xfs_bnobt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - - cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels, - xfs_allocbt_cur_cache); - cur->bc_ag.abt.active = false; + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); + cur->bc_group = xfs_group_hold(pag_group(pag)); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; - if (btnum == XFS_BTNUM_CNT) { - cur->bc_ops = &xfs_cntbt_ops; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); - cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; - } else { - cur->bc_ops = &xfs_bnobt_ops; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level); } - - cur->bc_ag.pag = xfs_perag_hold(pag); - - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - return cur; } /* - * Allocate a new allocation btree cursor. + * Allocate a new cntbt cursor. + * + * For staging cursors tp and agbp are NULL. */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - struct xfs_perag *pag, - xfs_btnum_t btnum) /* btree identifier */ -{ - struct xfs_agf *agf = agbp->b_addr; - struct xfs_btree_cur *cur; - - cur = xfs_allocbt_init_common(mp, tp, pag, btnum); - if (btnum == XFS_BTNUM_CNT) - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); - else - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); - - cur->bc_ag.agbp = agbp; - - return cur; -} - -/* Create a free space btree cursor with a fake root for staging. */ struct xfs_btree_cur * -xfs_allocbt_stage_cursor( +xfs_cntbt_init_cursor( struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); - xfs_btree_stage_afakeroot(cur, afake); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); + cur->bc_group = xfs_group_hold(pag_group(pag)); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level); + } return cur; } @@ -588,16 +543,16 @@ xfs_allocbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); - - if (cur->bc_btnum == XFS_BTNUM_BNO) { - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + if (xfs_btree_is_bno(cur->bc_ops)) { + agf->agf_bno_root = cpu_to_be32(afake->af_root); + agf->agf_bno_level = cpu_to_be32(afake->af_levels); } else { - cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + agf->agf_cnt_root = cpu_to_be32(afake->af_root); + agf->agf_cnt_level = cpu_to_be32(afake->af_levels); } + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in an alloc btree block. */ @@ -614,11 +569,11 @@ xfs_allocbt_block_maxrecs( /* * Calculate number of records in an alloc btree block. */ -int +unsigned int xfs_allocbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { blocklen -= XFS_ALLOC_BLOCK_LEN(mp); return xfs_allocbt_block_maxrecs(blocklen, leaf); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45df893ef6bb..12647f9aaa6d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -47,13 +47,14 @@ struct xbtree_afakeroot; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, +struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_perag *pag, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, - xfs_btnum_t btnum); -extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); + struct xfs_perag *pag); +struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag); +unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9976a00a73f9..8c04acd30d48 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -26,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" +#include "xfs_parent.h" struct kmem_cache *xfs_attr_intent_cache; @@ -50,7 +51,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -87,6 +87,8 @@ xfs_attr_is_leaf( struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; + ASSERT(!xfs_need_iread_extents(ifp)); + if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS) return false; @@ -224,11 +226,21 @@ int xfs_attr_get_ilocked( struct xfs_da_args *args) { - ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + int error; + + xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; + /* + * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf + * to work correctly. + */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) @@ -264,9 +276,11 @@ xfs_attr_get( if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; + if (!args->owner) + args->owner = args->dp->i_ino; args->geo = args->dp->i_mount->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + xfs_attr_sethash(args); /* Entirely possible to look up a name which doesn't exist */ args->op_flags = XFS_DA_OP_OKNOENT; @@ -314,26 +328,20 @@ xfs_attr_calc_size( return nblks; } -/* Initialize transaction reservation for attr operations */ -void -xfs_init_attr_trans( - struct xfs_da_args *args, - struct xfs_trans_res *tres, - unsigned int *total) +/* Initialize transaction reservation for an xattr set/replace/upsert */ +inline struct xfs_trans_res +xfs_attr_set_resv( + const struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; - - if (args->value) { - tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; - *total = args->total; - } else { - *tres = M_RES(mp)->tr_attrrm; - *total = XFS_ATTRRM_SPACE_RES(mp); - } + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans_res ret = { + .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args->total, + .tr_logcount = XFS_ATTRSET_LOG_COUNT, + .tr_logflags = XFS_TRANS_PERM_LOG_RES, + }; + + return ret; } /* @@ -363,7 +371,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). */ - if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) + if (!error) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (xfs_has_wsync(dp->i_mount)) @@ -401,6 +409,77 @@ out: return error; } +/* Compute the hash value for a user/root/secure extended attribute */ +xfs_dahash_t +xfs_attr_hashname( + const uint8_t *name, + int namelen) +{ + return xfs_da_hashname(name, namelen); +} + +/* Compute the hash value for any extended attribute from any namespace. */ +xfs_dahash_t +xfs_attr_hashval( + struct xfs_mount *mp, + unsigned int attr_flags, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + ASSERT(xfs_attr_check_namespace(attr_flags)); + + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_hashattr(mp, name, namelen, value, valuelen); + + return xfs_attr_hashname(name, namelen); +} + +/* Save the current remote block info and clear the current pointers. */ +static void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; +} + +/* Set stored info about a remote block */ +static void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + +/* + * PPTR_REPLACE operations require the caller to set the old and new names and + * values explicitly. Update the canonical fields to the new name and value + * here now that the removal phase has finished. + */ +static void +xfs_attr_update_pptr_replace_args( + struct xfs_da_args *args) +{ + ASSERT(args->new_namelen > 0); + args->name = args->new_name; + args->namelen = args->new_namelen; + args->value = args->new_value; + args->valuelen = args->new_valuelen; + xfs_attr_sethash(args); +} + /* * Handle the state change on completion of a multi-state attr operation. * @@ -418,58 +497,84 @@ xfs_attr_complete_op( enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; - bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + replace_state = XFS_DAS_DONE; + else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE) + xfs_attr_update_pptr_replace_args(args); args->op_flags &= ~XFS_DA_OP_REPLACE; - if (do_replace) { - args->attr_filter &= ~XFS_ATTR_INCOMPLETE; - return replace_state; - } - return XFS_DAS_DONE; + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; } +/* + * Try to add an attribute to an inode in leaf form. + */ static int xfs_attr_leaf_addname( struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_buf *bp; int error; ASSERT(xfs_attr_is_leaf(args->dp)); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); + if (error) + return error; + /* - * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. + * Look up the xattr name to set the insertion point for the new xattr. */ - error = xfs_attr_leaf_try_add(args); - - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + goto out_brelse; + trace_xfs_attr_leaf_replace(args); /* - * We're not in leaf format anymore, so roll the transaction and - * retry the add to the newly allocated node block. + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to + * add, not the attribute we just found and will remove later. */ - attr->xattri_dela_state = XFS_DAS_NODE_ADD; - goto out; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - if (error) - return error; /* * We need to commit and roll if we need to allocate remote xattr blocks * or perform more xattr manipulations. Otherwise there is nothing more * to do and we can return success. */ - if (args->rmtblkno) + if (!xfs_attr3_leaf_add(bp, args)) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + } else if (args->rmtblkno) { attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - else - attr->xattri_dela_state = xfs_attr_complete_op(attr, - XFS_DAS_LEAF_REPLACE); -out: + } else { + attr->xattri_dela_state = + xfs_attr_complete_op(attr, XFS_DAS_LEAF_REPLACE); + } + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return 0; + +out_brelse: + xfs_trans_brelse(args->trans, bp); return error; } @@ -492,7 +597,7 @@ xfs_attr_node_addname( return error; error = xfs_attr_node_try_addname(attr); - if (error == -ENOSPC) { + if (error == 1) { error = xfs_attr3_leaf_to_node(args); if (error) return error; @@ -647,8 +752,8 @@ xfs_attr_leaf_remove_attr( int forkoff; int error; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -679,7 +784,7 @@ xfs_attr_leaf_shrink( if (!xfs_attr_is_leaf(dp)) return 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); if (error) return error; @@ -868,6 +973,11 @@ xfs_attr_lookup( return -ENOATTR; } + /* Prerequisite for xfs_attr_is_leaf */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -883,73 +993,73 @@ xfs_attr_lookup( return error; } -static void -xfs_attr_defer_add( - struct xfs_da_args *args, - unsigned int op_flags) +int +xfs_attr_add_fork( + struct xfs_inode *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; /* transaction pointer */ + unsigned int blks; /* space reservation */ + int error; /* error return value */ - struct xfs_attr_intent *new; + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); - new->xattri_op_flags = op_flags; - new->xattri_da_args = args; + blks = XFS_ADDAFORK_SPACE_RES(mp); - switch (op_flags) { - case XFS_ATTRI_OP_FLAGS_SET: - new->xattri_dela_state = xfs_attr_init_add_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REPLACE: - new->xattri_dela_state = xfs_attr_init_replace_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REMOVE: - new->xattri_dela_state = xfs_attr_init_remove_state(args); - break; - default: - ASSERT(0); - } + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, + rsvd, &tp); + if (error) + return error; - xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); - trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); + if (xfs_inode_has_attr_fork(ip)) + goto trans_cancel; + + error = xfs_bmap_add_attrfork(tp, ip, size, rsvd); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* - * Note: If args->value is NULL the attribute will be removed, just like the - * Linux ->setattr API. + * Make a change to the xattr structure. + * + * The caller must have initialized @args, attached dquots, and must not hold + * any ILOCKs. Reserved data blocks may be used if @rsvd is set. + * + * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists. + * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist. + * Returns 0 on success, or a negative errno if something else went wrong. */ int xfs_attr_set( - struct xfs_da_args *args) + struct xfs_da_args *args, + enum xfs_attr_update op, + bool rsvd) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_trans_res tres; - bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; int rmt_blks = 0; - unsigned int total; - - if (xfs_is_shutdown(dp->i_mount)) - return -EIO; - - error = xfs_qm_dqattach(dp); - if (error) - return error; + unsigned int total = 0; - args->geo = mp->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + ASSERT(!args->trans); - /* - * We have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. Preserve the logged flag, since we need - * to pass that through to the logging code. - */ - args->op_flags = XFS_DA_OP_OKNOENT | - (args->op_flags & XFS_DA_OP_LOGGED); - - if (args->value) { + switch (op) { + case XFS_ATTRUPDATE_UPSERT: + case XFS_ATTRUPDATE_CREATE: + case XFS_ATTRUPDATE_REPLACE: XFS_STATS_INC(mp, xs_attr_set); args->total = xfs_attr_calc_size(args, &local); @@ -962,33 +1072,36 @@ xfs_attr_set( xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + error = xfs_attr_add_fork(dp, sf_size, rsvd); if (error) return error; } if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); - } else { + + tres = xfs_attr_set_resv(args); + total = args->total; + break; + case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); - rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); + rmt_blks = xfs_attr3_max_rmt_blocks(mp); + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + break; } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; - if (args->value || xfs_inode_hasattr(dp)) { - error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(args->trans, dp, - XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } @@ -996,26 +1109,26 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - if (!args->value) { + if (op == XFS_ATTRUPDATE_REMOVE) { /* if no value, we are performing a remove operation */ - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE); break; } /* Pure create fails if the attr already exists */ - if (args->attr_flags & XATTR_CREATE) + if (op == XFS_ATTRUPDATE_CREATE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ - if (!args->value) + if (op == XFS_ATTRUPDATE_REMOVE) goto out_trans_cancel; /* Pure replace fails if no existing attr to replace. */ - if (args->attr_flags & XATTR_REPLACE) + if (op == XFS_ATTRUPDATE_REPLACE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET); break; default: goto out_trans_cancel; @@ -1028,8 +1141,7 @@ xfs_attr_set( if (xfs_has_wsync(mp)) xfs_trans_set_sync(args->trans); - if (!(args->op_flags & XFS_DA_OP_NOTIME)) - xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. @@ -1038,6 +1150,7 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); + args->trans = NULL; return error; out_trans_cancel: @@ -1050,7 +1163,7 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ -static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; @@ -1109,88 +1222,6 @@ xfs_attr_shortform_addname( * External routines when attribute list is one block *========================================================================*/ -/* Save the current remote block info and clear the current pointers. */ -static void -xfs_attr_save_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno2 = args->blkno; - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; -} - -/* Set stored info about a remote block */ -static void -xfs_attr_restore_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno = args->blkno2; - args->index = args->index2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; -} - -/* - * Tries to add an attribute to an inode in leaf form - * - * This function is meant to execute as part of a delayed operation and leaves - * the transaction handling to the caller. On success the attribute is added - * and the inode and transaction are left dirty. If there is not enough space, - * the attr data is converted to node format and -ENOSPC is returned. Caller is - * responsible for handling the dirty inode and transaction or adding the attr - * in node format. - */ -STATIC int -xfs_attr_leaf_try_add( - struct xfs_da_args *args) -{ - struct xfs_buf *bp; - int error; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - /* - * Look up the xattr name to set the insertion point for the new xattr. - */ - error = xfs_attr3_leaf_lookup_int(bp, args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - goto out_brelse; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - goto out_brelse; - - trace_xfs_attr_leaf_replace(args); - /* - * Save the existing remote attr state so that the current - * values reflect the state of the new attribute we are about to - * add, not the attribute we just found and will remove later. - */ - xfs_attr_save_rmt_blk(args); - break; - case 0: - break; - default: - goto out_brelse; - } - - return xfs_attr3_leaf_add(bp, args); - -out_brelse: - xfs_trans_brelse(args->trans, bp); - return error; -} - /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -1201,7 +1232,7 @@ xfs_attr_leaf_hasname( { int error = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp); if (error) return error; @@ -1356,9 +1387,12 @@ error: /* * Add a name to a Btree-format attribute list. * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). + * This will involve walking down the Btree, and may involve splitting leaf + * nodes and even splitting intermediate nodes up to and including the root + * node (a special case of an intermediate node). + * + * If the tree was still in single leaf format and needs to converted to + * real node format return 1 and let the caller handle that. */ static int xfs_attr_node_try_addname( @@ -1366,21 +1400,21 @@ xfs_attr_node_try_addname( { struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; - int error; + int error = 0; trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_add(blk->bp, state->args); - if (error == -ENOSPC) { + if (!xfs_attr3_leaf_add(blk->bp, state->args)) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* * have been a b-tree. Let the caller deal with this. */ + error = 1; goto out; } @@ -1510,12 +1544,23 @@ out_release: return error; } +/* Enforce that there is at most one namespace bit per attr. */ +inline bool xfs_attr_check_namespace(unsigned int attr_flags) +{ + return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2; +} + /* Returns true if the attribute entry name is valid. */ bool xfs_attr_namecheck( + unsigned int attr_flags, const void *name, size_t length) { + /* Only one namespace bit allowed. */ + if (!xfs_attr_check_namespace(attr_flags)) + return false; + /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. @@ -1523,6 +1568,10 @@ xfs_attr_namecheck( if (length >= MAXNAMELEN) return false; + /* Parent pointers have their own validation. */ + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_namecheck(attr_flags, name, length); + /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 81be9b3e4004..0e51d0723f9a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern { /* void; state communicated via *context */ -typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int); +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context, + int flags, unsigned char *name, int namelen, void *value, + int valuelen); struct xfs_attr_list_context { struct xfs_trans *tp; @@ -510,8 +511,8 @@ struct xfs_attr_intent { struct xfs_da_args *xattri_da_args; /* - * Shared buffer containing the attr name and value so that the logging - * code can share large memory buffers between log items. + * Shared buffer containing the attr name, new name, and value so that + * the logging code can share large memory buffers between log items. */ struct xfs_attri_log_nameval *xattri_nameval; @@ -529,6 +530,11 @@ struct xfs_attr_intent { struct xfs_bmbt_irec xattri_map; }; +static inline unsigned int +xfs_attr_intent_op(const struct xfs_attr_intent *attr) +{ + return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; +} /*======================================================================== * Function prototypes for the kernel. @@ -544,13 +550,22 @@ int xfs_inode_hasattr(struct xfs_inode *ip); bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); -int xfs_attr_set(struct xfs_da_args *args); + +enum xfs_attr_update { + XFS_ATTRUPDATE_REMOVE, /* remove attr */ + XFS_ATTRUPDATE_UPSERT, /* set value, replace any existing attr */ + XFS_ATTRUPDATE_CREATE, /* set value, fail if attr already exists */ + XFS_ATTRUPDATE_REPLACE, /* set value, fail if attr does not exist */ +}; + +int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd); int xfs_attr_set_iter(struct xfs_attr_intent *attr); int xfs_attr_remove_iter(struct xfs_attr_intent *attr); -bool xfs_attr_namecheck(const void *name, size_t length); +bool xfs_attr_check_namespace(unsigned int attr_flags); +bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, + size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); -void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, - unsigned int *total); +struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args); /* * Check to see if the attr should be upgraded from non-existent or shortform to @@ -590,7 +605,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args) static inline enum xfs_delattr_state xfs_attr_init_remove_state(struct xfs_da_args *args) { - args->op_flags |= XFS_DA_OP_REMOVE; if (xfs_attr_is_shortform(args->dp)) return XFS_DAS_SF_REMOVE; if (xfs_attr_is_leaf(args->dp)) @@ -614,8 +628,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen); + +xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags, + const uint8_t *name, int namelen, const void *value, + int valuelen); + +/* Set the hash value for any extended attribute from any namespace. */ +static inline void xfs_attr_sethash(struct xfs_da_args *args) +{ + args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter, + args->name, args->namelen, + args->value, args->valuelen); +} + extern struct kmem_cache *xfs_attr_intent_cache; int __init xfs_attr_intent_init_cache(void); void xfs_attr_intent_destroy_cache(void); +int xfs_attr_sf_totsize(struct xfs_inode *dp); +int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6374bf107242..fddb55605e0c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -29,6 +29,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* @@ -46,7 +47,7 @@ */ STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, struct xfs_buf **bpp); -STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, +STATIC void xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int freemap_index); STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, @@ -387,6 +388,27 @@ xfs_attr3_leaf_verify( return NULL; } +xfs_failaddr_t +xfs_attr3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_attr3_leafblock *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_attr3_leaf_write_verify( struct xfs_buf *bp) @@ -447,16 +469,30 @@ int xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_attr3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); - return err; + return 0; } /*======================================================================== @@ -471,28 +507,57 @@ xfs_attr3_leaf_read( * INCOMPLETE flag will not be set in attr->attr_filter, but rather * XFS_DA_OP_RECOVERY will be set in args->op_flags. */ +static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args) +{ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return XFS_ATTR_NSP_ONDISK_MASK; + return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE; +} + +static inline bool +xfs_attr_parent_match( + const struct xfs_da_args *args, + const void *value, + unsigned int valuelen) +{ + ASSERT(args->value != NULL); + + /* Parent pointers do not use remote values */ + if (!value) + return false; + + /* + * The only value we support is a parent rec. However, we'll accept + * any valuelen so that offline repair can delete ATTR_PARENT values + * that are not parent pointers. + */ + if (valuelen != args->valuelen) + return false; + + return memcmp(args->value, value, valuelen) == 0; +} + static bool xfs_attr_match( struct xfs_da_args *args, - uint8_t namelen, - unsigned char *name, - int flags) + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen) { + unsigned int mask = xfs_attr_match_mask(args); if (args->namelen != namelen) return false; + if ((args->attr_filter & mask) != (attr_flags & mask)) + return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* Recovery ignores the INCOMPLETE flag. */ - if ((args->op_flags & XFS_DA_OP_RECOVERY) && - args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) - return true; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_attr_parent_match(args, value, valuelen); - /* All remaining matches need to be filtered by INCOMPLETE state. */ - if (args->attr_filter != - (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) - return false; return true; } @@ -503,6 +568,13 @@ xfs_attr_copy_value( int valuelen) { /* + * Parent pointer lookups require the caller to specify the name and + * value, so don't copy anything. + */ + if (args->attr_filter & XFS_ATTR_PARENT) + return 0; + + /* * No copy if all we have to do is get the length */ if (!args->valuelen) { @@ -614,7 +686,7 @@ xfs_attr_shortform_bytesfit( */ if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) - dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + dsize = xfs_bmdr_space_calc(MINDBTPTRS); break; case XFS_DINODE_FMT_BTREE: /* @@ -628,7 +700,7 @@ xfs_attr_shortform_bytesfit( return 0; return dp->i_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); + dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot); break; } @@ -636,11 +708,11 @@ xfs_attr_shortform_bytesfit( * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ - minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -710,8 +782,9 @@ xfs_attr_sf_findname( for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) + if (xfs_attr_match(args, sfe->flags, sfe->nameval, + sfe->namelen, &sfe->nameval[sfe->namelen], + sfe->valuelen)) return sfe; } @@ -818,7 +891,8 @@ xfs_attr_sf_removename( */ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && + !xfs_has_parent(mp)) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -827,7 +901,8 @@ xfs_attr_sf_removename( ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || - dp->i_df.if_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE || + xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -879,8 +954,7 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); - tmpbuffer = kmem_alloc(size, 0); - ASSERT(tmpbuffer != NULL); + tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, ifp->if_data, size); sf = (struct xfs_attr_sf_hdr *)tmpbuffer; @@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; sfe = xfs_attr_sf_firstentry(sf); for (i = 0; i < sf->count; i++) { @@ -911,20 +986,22 @@ xfs_attr_shortform_to_leaf( nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname(sfe->nameval, - sfe->namelen); nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; + if (!xfs_attr_check_namespace(sfe->flags)) { + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto out; + } + xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); - error = xfs_attr3_leaf_add(bp, &nargs); - ASSERT(error != -ENOSPC); - if (error) - goto out; + if (!xfs_attr3_leaf_add(bp, &nargs)) + ASSERT(0); sfe = xfs_attr_sf_nextentry(sfe); } error = 0; out: - kmem_free(tmpbuffer); + kfree(tmpbuffer); return error; } @@ -1027,7 +1104,7 @@ xfs_attr_shortform_verify( * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ - if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; @@ -1059,10 +1136,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); - if (!tmpbuffer) - return -ENOMEM; - + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1106,6 +1180,7 @@ xfs_attr3_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) @@ -1125,7 +1200,7 @@ xfs_attr3_leaf_to_shortform( error = 0; out: - kmem_free(tmpbuffer); + kvfree(tmpbuffer); return error; } @@ -1158,7 +1233,7 @@ xfs_attr3_leaf_to_node( error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1); if (error) goto out; @@ -1237,7 +1312,7 @@ xfs_attr3_leaf_create( ichdr.magic = XFS_ATTR3_LEAF_MAGIC; hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); @@ -1256,6 +1331,9 @@ xfs_attr3_leaf_create( /* * Split the leaf node, rebalance, then add the new entry. + * + * Returns 0 if the entry was added, 1 if a further split is needed or a + * negative error number otherwise. */ int xfs_attr3_leaf_split( @@ -1263,8 +1341,9 @@ xfs_attr3_leaf_split( struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk) { - xfs_dablk_t blkno; - int error; + bool added; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_split(state->args); @@ -1299,10 +1378,10 @@ xfs_attr3_leaf_split( */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr3_leaf_add(oldblk->bp, state->args); + added = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr3_leaf_add(newblk->bp, state->args); + added = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1310,13 +1389,15 @@ xfs_attr3_leaf_split( */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); - return error; + if (!added) + return 1; + return 0; } /* * Add a name to the leaf attribute list structure. */ -int +bool xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) @@ -1325,6 +1406,7 @@ xfs_attr3_leaf_add( struct xfs_attr3_icleaf_hdr ichdr; int tablesize; int entsize; + bool added = true; int sum; int tmp; int i; @@ -1353,7 +1435,7 @@ xfs_attr3_leaf_add( if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); if (ichdr.freemap[i].size >= tmp) { - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + xfs_attr3_leaf_add_work(bp, &ichdr, args, i); goto out_log_hdr; } sum += ichdr.freemap[i].size; @@ -1365,7 +1447,7 @@ xfs_attr3_leaf_add( * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) - return -ENOSPC; + return false; /* * Compact the entries to coalesce free space. @@ -1378,24 +1460,24 @@ xfs_attr3_leaf_add( * free region, in freemap[0]. If it is not big enough, give up. */ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { - tmp = -ENOSPC; + added = false; goto out_log_hdr; } - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); - return tmp; + return added; } /* * Add a name to a leaf attribute list structure. */ -STATIC int +STATIC void xfs_attr3_leaf_add_work( struct xfs_buf *bp, struct xfs_attr3_icleaf_hdr *ichdr, @@ -1513,7 +1595,6 @@ xfs_attr3_leaf_add_work( } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); - return 0; } /* @@ -1533,7 +1614,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1571,7 +1652,7 @@ xfs_attr3_leaf_compact( */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); - kmem_free(tmpbuffer); + kvfree(tmpbuffer); } /* @@ -1993,7 +2074,7 @@ xfs_attr3_leaf_toosmall( if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, - blkno, &bp); + state->args->owner, blkno, &bp); if (error) return error; @@ -2250,7 +2331,8 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); + tmp_leaf = kvzalloc(state->args->geo->blksize, + GFP_KERNEL | __GFP_NOFAIL); /* * Copy the header into the temp leaf so that all the stuff @@ -2290,7 +2372,7 @@ xfs_attr3_leaf_unbalance( } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ - kmem_free(tmp_leaf); + kvfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); @@ -2343,6 +2425,7 @@ xfs_attr3_leaf_lookup_int( entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -2362,10 +2445,12 @@ xfs_attr3_leaf_lookup_int( } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -2397,18 +2482,23 @@ xfs_attr3_leaf_lookup_int( */ if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (!xfs_attr_match(args, name_loc->namelen, - name_loc->nameval, entry->flags)) + if (!xfs_attr_match(args, entry->flags, + name_loc->nameval, name_loc->namelen, + &name_loc->nameval[name_loc->namelen], + be16_to_cpu(name_loc->valuelen))) continue; args->index = probe; return -EEXIST; } else { + unsigned int valuelen; + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (!xfs_attr_match(args, name_rmt->namelen, - name_rmt->name, entry->flags)) + valuelen = be32_to_cpu(name_rmt->valuelen); + if (!xfs_attr_match(args, entry->flags, name_rmt->name, + name_rmt->namelen, NULL, valuelen)) continue; args->index = probe; - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = valuelen; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, @@ -2711,7 +2801,8 @@ xfs_attr3_leaf_clearflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2775,7 +2866,8 @@ xfs_attr3_leaf_setflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2834,7 +2926,8 @@ xfs_attr3_leaf_flipflags( /* * Read the block containing the "old" attr */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp1); if (error) return error; @@ -2842,8 +2935,8 @@ xfs_attr3_leaf_flipflags( * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, - &bp2); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno2, &bp2); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 9b9948639c0f..589f810eedc0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -76,7 +76,7 @@ int xfs_attr3_leaf_split(struct xfs_da_state *state, int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, +bool xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); @@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); +xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp, + xfs_ino_t owner); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d440393b40eb..4c44ce1c8a64 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -22,6 +22,7 @@ #include "xfs_attr_remote.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_health.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ @@ -42,19 +43,32 @@ * the logging system and therefore never have a log item. */ -/* - * Each contiguous block has a header, so it is not just a simple attribute - * length to FSB conversion. - */ -int +/* How many bytes can be stored in a remote value buffer? */ +inline unsigned int +xfs_attr3_rmt_buf_space( + struct xfs_mount *mp) +{ + unsigned int blocksize = mp->m_attr_geo->blksize; + + if (xfs_has_crc(mp)) + return blocksize - sizeof(struct xfs_attr3_rmt_hdr); + + return blocksize; +} + +/* Compute number of fsblocks needed to store a remote attr value */ +unsigned int xfs_attr3_rmt_blocks( - struct xfs_mount *mp, - int attrlen) + struct xfs_mount *mp, + unsigned int attrlen) { - if (xfs_has_crc(mp)) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; - } + /* + * Each contiguous block has a header, so it is not just a simple + * attribute length to FSB conversion. + */ + if (xfs_has_crc(mp)) + return howmany(attrlen, xfs_attr3_rmt_buf_space(mp)); + return XFS_B_TO_FSB(mp, attrlen); } @@ -91,7 +105,6 @@ xfs_attr3_rmt_verify( struct xfs_mount *mp, struct xfs_buf *bp, void *ptr, - int fsbsize, xfs_daddr_t bno) { struct xfs_attr3_rmt_hdr *rmt = ptr; @@ -102,7 +115,7 @@ xfs_attr3_rmt_verify( return __this_address; if (be64_to_cpu(rmt->rm_blkno) != bno) return __this_address; - if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt)) return __this_address; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) @@ -121,9 +134,9 @@ __xfs_attr3_rmt_read_verify( { struct xfs_mount *mp = bp->b_mount; char *ptr; - int len; + unsigned int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) @@ -140,7 +153,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -185,7 +198,7 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; @@ -202,7 +215,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -276,32 +289,34 @@ xfs_attr3_rmt_hdr_set( */ STATIC int xfs_attr_rmtval_copyout( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_ino_t ino, - int *offset, - int *valuelen, - uint8_t **dst) + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_inode *dp, + xfs_ino_t owner, + unsigned int *offset, + unsigned int *valuelen, + uint8_t **dst) { - char *src = bp->b_addr; - xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + char *src = bp->b_addr; + xfs_daddr_t bno = xfs_buf_daddr(bp); + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size = 0; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size = 0; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); if (xfs_has_crc(mp)) { - if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, + if (xfs_attr3_rmt_hdr_ok(src, owner, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", - bno, *offset, byte_cnt, ino); + bno, *offset, byte_cnt, owner); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } hdr_size = sizeof(struct xfs_attr3_rmt_hdr); @@ -327,20 +342,20 @@ xfs_attr_rmtval_copyin( struct xfs_mount *mp, struct xfs_buf *bp, xfs_ino_t ino, - int *offset, - int *valuelen, + unsigned int *offset, + unsigned int *valuelen, uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, @@ -386,12 +401,12 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; uint8_t *dst = args->value; - int valuelen; + unsigned int valuelen; int nmap; int error; - int blkcnt = args->rmtblkcnt; + unsigned int blkcnt = args->rmtblkcnt; int i; - int offset = 0; + unsigned int offset = 0; trace_xfs_attr_rmtval_get(args); @@ -418,12 +433,13 @@ xfs_attr_rmtval_get( dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK); if (error) return error; - error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, - &offset, &valuelen, - &dst); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp, + args->owner, &offset, &valuelen, &dst); xfs_buf_relse(bp); if (error) return error; @@ -448,7 +464,7 @@ xfs_attr_rmt_find_hole( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int error; - int blkcnt; + unsigned int blkcnt; xfs_fileoff_t lfileoff = 0; /* @@ -477,11 +493,11 @@ xfs_attr_rmtval_set_value( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; uint8_t *src = args->value; - int blkcnt; - int valuelen; + unsigned int blkcnt; + unsigned int valuelen; int nmap; int error; - int offset = 0; + unsigned int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -517,8 +533,8 @@ xfs_attr_rmtval_set_value( return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; - xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, - &valuelen, &src); + xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen, + &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); @@ -545,11 +561,13 @@ xfs_attr_rmtval_stale( struct xfs_buf *bp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || - XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) { + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), @@ -619,7 +637,6 @@ xfs_attr_rmtval_set_blk( if (error) return error; - ASSERT(nmap == 1); ASSERT((map->br_startblock != DELAYSTARTBLOCK) && (map->br_startblock != HOLESTARTBLOCK)); @@ -639,7 +656,7 @@ xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; - int blkcnt; + unsigned int blkcnt; int error; /* @@ -659,8 +676,10 @@ xfs_attr_rmtval_invalidate( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) { + xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d097ec6c4dc3..e3c6c7d774bf 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -6,7 +6,13 @@ #ifndef __XFS_ATTR_REMOTE_H__ #define __XFS_ATTR_REMOTE_H__ -int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); +unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen); + +/* Number of rmt blocks needed to store the maximally sized attr value */ +static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp) +{ + return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); +} int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bc4422223024..73bdc0e55682 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort { uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ unsigned char *name; /* name value, pointer into buffer */ + void *value; } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f362345467fa..d954f9b8071f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,8 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" +#include "xfs_health.h" +#include "xfs_bmap_item.h" +#include "xfs_symlink_remote.h" +#include "xfs_inode_util.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -75,9 +80,9 @@ xfs_bmap_compute_maxlevels( maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), whichfork); if (whichfork == XFS_DATA_FORK) - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + sz = xfs_bmdr_space_calc(MINDBTPTRS); else - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + sz = xfs_bmdr_space_calc(MINABTPTRS); maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -98,8 +103,8 @@ xfs_bmap_compute_attr_offset( struct xfs_mount *mp) { if (mp->m_sb.sb_inodesize == 256) - return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); - return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); + return xfs_bmdr_space_calc(6 * MINABTPTRS); } STATIC int /* error */ @@ -166,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset( } } +static int +xfs_bmap_read_buf( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + struct xfs_buf **bpp) /* buffer for fsbno */ +{ + struct xfs_buf *bp; /* return value */ + int error; + + if (!xfs_verify_fsbno(mp, fsbno)) + return -EFSCORRUPTED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp, + &xfs_bmbt_buf_ops); + if (!error) { + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + *bpp = bp; + } + return error; +} + #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -272,7 +297,7 @@ xfs_check_block( prevp = NULL; for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + keyp = xfs_bmbt_key_addr(mp, block, i); if (prevp) { ASSERT(be64_to_cpu(prevp->br_startoff) < @@ -284,15 +309,15 @@ xfs_check_block( * Compare the block numbers to see if there are dups. */ if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz); else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr); for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz); else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr); if (*thispa == *pp) { xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, @@ -347,7 +372,7 @@ xfs_bmap_check_leaf_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLFSBLOCK); @@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -380,9 +405,10 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -419,14 +445,14 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. */ - ep = XFS_BMBT_REC_ADDR(mp, block, 1); + ep = xfs_bmbt_rec_addr(mp, block, 1); if (i) { ASSERT(xfs_bmbt_disk_get_startoff(&last) + xfs_bmbt_disk_get_blockcount(&last) <= xfs_bmbt_disk_get_startoff(ep)); } for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + nextp = xfs_bmbt_rec_addr(mp, block, j + 1); ASSERT(xfs_bmbt_disk_get_startoff(ep) + xfs_bmbt_disk_get_blockcount(ep) <= xfs_bmbt_disk_get_startoff(nextp)); @@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -557,16 +583,19 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } #endif - error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, tp, cbno, &cbp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); @@ -575,7 +604,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -584,7 +613,7 @@ xfs_bmap_btree_to_extents( xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -628,20 +657,18 @@ xfs_bmap_extents_to_btree( * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ - xfs_iroot_realloc(ip, 1, whichfork); + block = xfs_bmap_broot_realloc(ip, whichfork, 1); /* * Fill in the root. */ - block = ifp->if_broot; - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; + if (wasdel) + cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; /* * Convert to a btree with two levels, one record in root. */ @@ -667,7 +694,7 @@ xfs_bmap_extents_to_btree( goto out_root_realloc; } - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -679,16 +706,13 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ - abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, ablock, abp, 0, 0); for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt); + arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt); xfs_bmbt_disk_set_all(arp, &rec); cnt++; } @@ -698,10 +722,10 @@ xfs_bmap_extents_to_btree( /* * Fill in the root key and pointer. */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp = xfs_bmbt_key_addr(mp, block, 1); + arp = xfs_bmbt_rec_addr(mp, ablock, 1); kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur, be16_to_cpu(block->bb_level))); *pp = cpu_to_be64(args.fsbno); @@ -719,7 +743,7 @@ xfs_bmap_extents_to_btree( out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -754,7 +778,7 @@ xfs_bmap_local_to_extents_empty( } -STATIC int /* error */ +int /* error */ xfs_bmap_local_to_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ @@ -764,7 +788,8 @@ xfs_bmap_local_to_extents( void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp)) + struct xfs_ifork *ifp, void *priv), + void *priv) { int error = 0; int flags; /* logging flags returned */ @@ -825,7 +850,7 @@ xfs_bmap_local_to_extents( * log here. Note that init_fn must also set the buffer log item type * correctly. */ - init_fn(tp, bp, ip, ifp); + init_fn(tp, bp, ip, ifp, priv); /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -869,7 +894,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) + if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree( goto error0; /* must be at least one entry */ if (XFS_IS_CORRUPT(mp, stat != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -950,16 +976,18 @@ xfs_bmap_add_attrfork_local( dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; + dargs.owner = ip->i_ino; return xfs_dir2_sf_to_block(&dargs); } if (S_ISLNK(VFS_I(ip)->i_mode)) return xfs_bmap_local_to_extents(tp, ip, 1, flags, - XFS_DATA_FORK, - xfs_symlink_local_to_remote); + XFS_DATA_FORK, xfs_symlink_local_to_remote, + NULL); /* should only be called for types that support local format data */ ASSERT(0); + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -996,40 +1024,30 @@ xfs_bmap_set_attrforkoff( } /* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. + * Convert inode from non-attributed to attributed. Caller must hold the + * ILOCK_EXCL and the file cannot have an attr fork. */ int /* error code */ xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ + struct xfs_trans *tp, + struct xfs_inode *ip, /* incore inode pointer */ int size, /* space new attribute needs */ int rsvd) /* xact may use reserved blks */ { - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ + struct xfs_mount *mp = tp->t_mountp; int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(xfs_inode_has_attr_fork(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - - blks = XFS_ADDAFORK_SPACE_RES(mp); - - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, - rsvd, &tp); - if (error) - return error; - if (xfs_inode_has_attr_fork(ip)) - goto trans_cancel; + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) - goto trans_cancel; + return error; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; @@ -1050,7 +1068,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto trans_cancel; + return error; if (!xfs_has_attr(mp) || (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; @@ -1069,14 +1087,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; - -trans_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + return 0; } /* @@ -1143,11 +1154,12 @@ xfs_iread_bmbt_block( (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, sizeof(*block), __this_address); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } /* Copy records into the incore cache. */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); + frp = xfs_bmbt_rec_addr(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { struct xfs_bmbt_irec new; xfs_failaddr_t fa; @@ -1158,6 +1170,7 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); + xfs_bmap_mark_sick(ip, whichfork); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -1189,7 +1202,7 @@ xfs_iread_extents( if (!xfs_need_iread_extents(ifp)) return 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); @@ -1201,6 +1214,7 @@ xfs_iread_extents( goto out; if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto out; } @@ -1213,6 +1227,8 @@ xfs_iread_extents( smp_store_release(&ifp->if_needextents, 0); return 0; out: + if (xfs_metadata_is_sick(error)) + xfs_bmap_mark_sick(ip, whichfork); xfs_iext_destroy(ifp); return error; } @@ -1292,6 +1308,7 @@ xfs_bmap_last_before( break; default: ASSERT(0); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -1388,8 +1405,10 @@ xfs_bmap_last_offset( if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -1403,6 +1422,24 @@ xfs_bmap_last_offset( * Extent tree manipulation functions used during allocation. */ +static inline bool +xfs_bmap_same_rtgroup( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *left, + struct xfs_bmbt_irec *right) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, left->br_startblock) != + xfs_rtb_to_rgno(mp, right->br_startblock)) + return false; + } + + return true; +} + /* * Convert a delayed allocation to a real allocation. */ @@ -1429,8 +1466,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(whichfork != XFS_ATTR_FORK); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1473,7 +1509,8 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -1497,7 +1534,8 @@ xfs_bmap_add_extent_delay_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1528,6 +1566,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1535,6 +1574,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1542,6 +1582,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1549,6 +1590,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1571,6 +1613,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1578,6 +1621,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1604,6 +1648,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1611,6 +1656,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1632,6 +1678,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1639,10 +1686,12 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1673,6 +1722,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1680,6 +1730,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING: @@ -1698,6 +1749,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1705,6 +1757,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1721,7 +1774,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1749,6 +1802,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1767,6 +1821,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_next(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); + ASSERT(da_new <= da_old); break; case BMAP_RIGHT_FILLING: @@ -1785,6 +1840,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1792,6 +1848,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1808,12 +1865,13 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_next(ifp, &bma->icur); + ASSERT(da_new <= da_old); break; case 0: @@ -1871,6 +1929,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1878,6 +1937,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1926,19 +1986,18 @@ xfs_bmap_add_extent_delay_real( } if (da_new != da_old) - xfs_mod_delalloc(mp, (int64_t)da_new - da_old); + xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_ino.allocated; - bma->cur->bc_ino.allocated = 0; + da_new += bma->cur->bc_bmap.allocated; + bma->cur->bc_bmap.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ - if (da_new != da_old) { - ASSERT(state == 0 || da_new < da_old); - error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), - false); - } + if (da_new < da_old) + xfs_add_fdblocks(mp, da_old - da_new); + else if (da_new > da_old) + error = xfs_dec_fdblocks(mp, da_new - da_old, true); xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: @@ -2021,7 +2080,8 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -2045,7 +2105,8 @@ xfs_bmap_add_extent_unwritten_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; /* @@ -2074,30 +2135,35 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2126,18 +2192,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2169,18 +2238,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2207,6 +2279,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2240,6 +2313,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2277,6 +2351,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2287,6 +2362,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2317,6 +2393,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2353,6 +2430,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2363,12 +2441,14 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2405,6 +2485,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2417,6 +2498,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2429,6 +2511,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2436,6 +2519,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2472,7 +2556,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; *curp = cur; } @@ -2486,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. - */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), - false); - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -2651,7 +2595,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2682,7 +2626,8 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &left, new)) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && @@ -2692,7 +2637,8 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &right)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2721,6 +2667,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2728,6 +2675,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2735,6 +2683,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2764,6 +2713,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2794,6 +2744,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2820,6 +2771,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2827,6 +2779,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2853,7 +2806,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -3042,6 +2995,30 @@ xfs_bmap_extsize_align( return 0; } +static inline bool +xfs_bmap_adjacent_valid( + struct xfs_bmalloca *ap, + xfs_fsblock_t x, + xfs_fsblock_t y) +{ + struct xfs_mount *mp = ap->ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ap->ip) && + (ap->datatype & XFS_ALLOC_USERDATA)) { + if (!xfs_has_rtgroups(mp)) + return x < mp->m_sb.sb_rblocks; + + return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) && + xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount && + xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents; + + } + + return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks; +} + #define XFS_ALLOC_GAP_UNITS 4 /* returns true if ap->blkno was modified */ @@ -3049,36 +3026,25 @@ bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - xfs_fsblock_t adjust; /* adjustment to block numbers */ - xfs_mount_t *mp; /* mount point structure */ - int rt; /* true if inode is realtime */ - -#define ISVALID(x,y) \ - (rt ? \ - (x) < mp->m_sb.sb_rblocks : \ - XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ - XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ - XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) - - mp = ap->ip->i_mount; - rt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->datatype & XFS_ALLOC_USERDATA); + xfs_fsblock_t adjust; /* adjustment to block numbers */ + /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && !isnullstartblock(ap->prev.br_startblock) && - ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, - ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, + ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); - if (adjust && - ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + if (adjust && xfs_bmap_adjacent_valid(ap, ap->blkno + adjust, + ap->prev.br_startblock)) ap->blkno += adjust; return true; } @@ -3101,7 +3067,8 @@ xfs_bmap_adjacent( !isnullstartblock(ap->prev.br_startblock) && (prevbno = ap->prev.br_startblock + ap->prev.br_blockcount) && - ISVALID(prevbno, ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, prevbno, + ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ @@ -3117,8 +3084,8 @@ xfs_bmap_adjacent( * number, then just use the end of the previous block. */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(prevbno + prevdiff, - ap->prev.br_startblock)) + xfs_bmap_adjacent_valid(ap, prevbno + prevdiff, + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -3150,9 +3117,11 @@ xfs_bmap_adjacent( * offset by our length. */ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(gotbno - gotdiff, gotbno)) + xfs_bmap_adjacent_valid(ap, gotbno - gotdiff, + gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->length, gotbno)) { + else if (xfs_bmap_adjacent_valid(ap, gotbno - ap->length, + gotbno)) { gotbno -= ap->length; gotdiff += adjust - ap->length; } else @@ -3180,7 +3149,7 @@ xfs_bmap_adjacent( return true; } } -#undef ISVALID + return false; } @@ -3201,7 +3170,7 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(pag->pag_mount, pag), + xfs_alloc_min_freelist(pag_mount(pag), pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3291,7 +3260,7 @@ xfs_bmap_alloc_account( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); return; } @@ -3315,7 +3284,7 @@ xfs_bmap_alloc_account( xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= ap->length; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; } else { fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -3343,6 +3312,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -3398,31 +3372,19 @@ xfs_bmap_process_allocated_extent( xfs_bmap_alloc_account(ap); } -#ifdef DEBUG static int xfs_bmap_exact_minlen_extent_alloc( - struct xfs_bmalloca *ap) + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) { - struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - int error; - - ASSERT(ap->length); - if (ap->minlen != 1) { - ap->blkno = NULLFSBLOCK; - ap->length = 0; + args->fsbno = NULLFSBLOCK; return 0; } - orig_offset = ap->offset; - orig_length = ap->length; - - args.alloc_minlen_only = 1; - - xfs_bmap_compute_alignments(ap, &args); + args->alloc_minlen_only = 1; + args->minlen = args->maxlen = ap->minlen; + args->total = ap->total; /* * Unlike the longest extent available in an AG, we don't track @@ -3432,39 +3394,16 @@ xfs_bmap_exact_minlen_extent_alloc( * we need not be concerned about a drop in performance in * "debug only" code paths. */ - ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); + ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); - args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - args.minlen = args.maxlen = ap->minlen; - args.total = ap->total; - - args.alignment = 1; - args.minalignslop = 0; - - args.minleft = ap->minleft; - args.wasdel = ap->wasdel; - args.resv = XFS_AG_RESV_NONE; - args.datatype = ap->datatype; - - error = xfs_alloc_vextent_first_ag(&args, ap->blkno); - if (error) - return error; - - if (args.fsbno != NULLFSBLOCK) { - xfs_bmap_process_allocated_extent(ap, &args, orig_offset, - orig_length); - } else { - ap->blkno = NULLFSBLOCK; - ap->length = 0; - } - - return 0; + /* + * Call xfs_bmap_btalloc_low_space here as it first does a "normal" AG + * iteration and then drops args->total to args->minlen, which might be + * required to find an allocation for the transaction reservation when + * the file system is very full. + */ + return xfs_bmap_btalloc_low_space(ap, args); } -#else - -#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) - -#endif /* * If we are not low on available data blocks and we are allocating at @@ -3487,12 +3426,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3660,7 +3599,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for @@ -3722,8 +3662,11 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); + else if ((ap->datatype & XFS_ALLOC_USERDATA) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); else error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); @@ -3898,14 +3841,18 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (WARN_ON_ONCE(!ifp)) + if (WARN_ON_ONCE(!ifp)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; @@ -3955,152 +3902,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - int error; - xfs_fileoff_t aoff = off; - - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - return error; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); - if (error) - goto out_unreserve_quota; - - error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); - if (error) - goto out_unreserve_blocks; - - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip->i_mount, alen + indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. - */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_blocks: - xfs_mod_fdblocks(mp, alen, false); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); - return error; -} - -static int -xfs_bmap_alloc_userdata( - struct xfs_bmalloca *bma) -{ - struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = xfs_bmapi_whichfork(bma->flags); - int error; - - /* - * Set the data type being allocated. For the data fork, the first data - * in the file is treated differently to all other allocations. For the - * attribute fork, we only need to ensure the allocated range is not on - * the busy list. - */ - bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { - bma->datatype |= XFS_ALLOC_USERDATA; - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - - if (mp->m_dalign && bma->length >= mp->m_dalign) { - error = xfs_bmap_isaeof(bma, whichfork); - if (error) - return error; - } - - if (XFS_IS_REALTIME_INODE(bma->ip)) - return xfs_bmap_rtalloc(bma); - } - - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - return xfs_bmap_exact_minlen_extent_alloc(bma); - - return xfs_bmap_btalloc(bma); -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4108,43 +3909,51 @@ xfs_bmapi_allocate( struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); - int tmp_logflags = 0; int error; ASSERT(bma->length > 0); - - /* - * For the wasdelay case, we could also just allocate the stuff asked - * for in this bmap call but that wouldn't be as good. - */ - if (bma->wasdel) { - bma->length = (xfs_extlen_t)bma->got.br_blockcount; - bma->offset = bma->got.br_startoff; - if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) - bma->prev.br_startoff = NULLFILEOFF; - } else { - bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); - if (!bma->eof) - bma->length = XFS_FILBLKS_MIN(bma->length, - bma->got.br_startoff - bma->offset); - } + ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN); if (bma->flags & XFS_BMAPI_CONTIG) bma->minlen = bma->length; else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) { - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - error = xfs_bmap_exact_minlen_extent_alloc(bma); - else - error = xfs_bmap_btalloc(bma); - } else { - error = xfs_bmap_alloc_userdata(bma); + if (!(bma->flags & XFS_BMAPI_METADATA)) { + /* + * For the data and COW fork, the first data in the file is + * treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range + * is not on the busy list. + */ + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + + if (mp->m_dalign && bma->length >= mp->m_dalign) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + } } - if (error || bma->blkno == NULLFSBLOCK) + + if ((bma->datatype & XFS_ALLOC_USERDATA) && + XFS_IS_REALTIME_INODE(bma->ip)) + error = xfs_bmap_rtalloc(bma); + else + error = xfs_bmap_btalloc(bma); + if (error) return error; + if (bma->blkno == NULLFSBLOCK) + return -ENOSPC; + + if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) { + xfs_bmap_mark_sick(bma->ip, whichfork); + return -EFSCORRUPTED; + } if (bma->flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); @@ -4160,9 +3969,8 @@ xfs_bmapi_allocate( */ bma->nallocs++; - if (bma->cur) - bma->cur->bc_ino.flags = - bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; + if (bma->cur && bma->wasdel) + bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4178,8 +3986,6 @@ xfs_bmapi_allocate( error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, &bma->logflags, bma->flags); - - bma->logflags |= tmp_logflags; if (error) return error; @@ -4324,6 +4130,15 @@ xfs_bmapi_finish( * extent state if necessary. Details behaviour is controlled by the flags * parameter. Only allocates blocks from a single allocation group, to avoid * locking problems. + * + * Returns 0 on success and places the extent mappings in mval. nmaps is used + * as an input/output parameter where the caller specifies the maximum number + * of mappings that may be returned and xfs_bmapi_write passes back the number + * of mappings (including existing mappings) it found. + * + * Returns a negative error code on failure, including -ENOSPC when it could not + * allocate any blocks and -ENOSR when it did allocate blocks to convert a + * delalloc range, but those blocks were before the passed in range. */ int xfs_bmapi_write( @@ -4369,7 +4184,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ @@ -4386,6 +4201,7 @@ xfs_bmapi_write( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4441,28 +4257,42 @@ xfs_bmapi_write( * allocation length request (which can be 64 bits in * length) and the bma length request, which is * xfs_extlen_t and therefore 32 bits. Hence we have to - * check for 32-bit overflows and handle them here. + * be careful and do the min() using the larger type to + * avoid overflows. */ - if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) - bma.length = XFS_MAX_BMBT_EXTLEN; - else - bma.length = len; + bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN); + + if (wasdelay) { + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_blockcount - + (bno - bma.got.br_startoff)); + } else { + if (!eof) + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_startoff - bno); + } - ASSERT(len > 0); ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma); - if (error) + if (error) { + /* + * If we already allocated space in a previous + * iteration return what we go so far when + * running out of space. + */ + if (error == -ENOSPC && bma.nallocs) + break; goto error0; - if (bma.blkno == NULLFSBLOCK) - break; + } /* * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); + xfs_refcount_alloc_cow_extent(tp, + XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); } /* Deal with the allocated space we found. */ @@ -4492,7 +4322,6 @@ xfs_bmapi_write( if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } - *nmap = n; error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4503,7 +4332,22 @@ xfs_bmapi_write( ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + orig_nmap, n); + + /* + * When converting delayed allocations, xfs_bmapi_allocate ignores + * the passed in bno and always converts from the start of the found + * delalloc extent. + * + * To avoid a successful return with *nmap set to 0, return the magic + * -ENOSR error code for this particular case so that the caller can + * handle it. + */ + if (!n) { + ASSERT(bma.nallocs >= *nmap); + return -ENOSR; + } + *nmap = n; return 0; error0: xfs_bmapi_finish(&bma, whichfork, error); @@ -4516,8 +4360,8 @@ error0: * invocations to allocate the target offset if a large enough physical extent * is not available. */ -int -xfs_bmapi_convert_delalloc( +static int +xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, @@ -4547,11 +4391,8 @@ xfs_bmapi_convert_delalloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, whichfork, + error = xfs_iext_count_extend(tp, ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -4574,19 +4415,25 @@ xfs_bmapi_convert_delalloc( if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } bma.tp = tp; bma.ip = ip; bma.wasdel = true; - bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, - XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* + * Always allocate convert from the start of the delalloc extent even if + * that is outside the passed in range to create large contiguous + * extents on disk. + */ + bma.offset = bma.got.br_startoff; + bma.length = bma.got.br_blockcount; + + /* * When we're converting the delalloc reservations backing dirty pages * in the page cache, we must be careful about how we create the new * extents: @@ -4610,23 +4457,18 @@ xfs_bmapi_convert_delalloc( if (error) goto out_finish; - error = -ENOSPC; - if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) - goto out_finish; - error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) - goto out_finish; - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4646,6 +4488,36 @@ out_trans_cancel: return error; } +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in iomap. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + loff_t offset, + struct iomap *iomap, + unsigned int *seq) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into iomap. Allocate in a loop because it may + * take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, + iomap, seq); + if (error) + return error; + } while (iomap->offset + iomap->length <= offset); + + return 0; +} + int xfs_bmapi_remap( struct xfs_trans *tp, @@ -4666,7 +4538,7 @@ xfs_bmapi_remap( ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != @@ -4674,6 +4546,7 @@ xfs_bmapi_remap( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4691,12 +4564,11 @@ xfs_bmapi_remap( } ip->i_nblocks += len; + ip->i_delayed_blks -= len; /* see xfs_bmap_defer_add */ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } got.br_startoff = bno; got.br_startblock = startblock; @@ -4738,32 +4610,18 @@ error0: * ores == 1). The number of stolen blocks is returned. The availability and * subsequent accounting of stolen blocks is the responsibility of the caller. */ -static xfs_filblks_t +static void xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ - xfs_filblks_t *indlen2, /* ext2 worst indlen */ - xfs_filblks_t avail) /* stealable blocks */ + xfs_filblks_t *indlen2) /* ext2 worst indlen */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ - xfs_filblks_t stolen = 0; xfs_filblks_t resfactor; /* - * Steal as many blocks as we can to try and satisfy the worst case - * indlen for both new extents. - */ - if (ores < nres && avail) - stolen = XFS_FILBLKS_MIN(nres - ores, avail); - ores += stolen; - - /* nothing else to do if we've satisfied the new reservation */ - if (ores >= nres) - return stolen; - - /* * We can't meet the total required reservation for the two extents. * Calculate the percent of the overall shortage between both extents * and apply this percentage to each of the requested indlen values. @@ -4807,31 +4665,30 @@ xfs_bmap_split_indlen( *indlen1 = len1; *indlen2 = len2; - - return stolen; } -int +void xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; - xfs_filblks_t got_indlen, new_indlen, stolen; + xfs_filblks_t got_indlen, new_indlen, stolen = 0; uint32_t state = xfs_bmap_fork_to_state(whichfork); - int error = 0; + uint64_t fdblocks; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; da_old = startblockval(got->br_startblock); @@ -4841,18 +4698,12 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) - xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); - /* * Update the inode delalloc counter now and wait to update the * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - ASSERT(!isrt); - error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); - if (error) - return error; + xfs_quota_unreserve_blkres(ip, del->br_blockcount); ip->i_delayed_blks -= del->br_blockcount; if (got->br_startoff == del->br_startoff) @@ -4906,8 +4757,24 @@ xfs_bmap_del_extent_delay( new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); WARN_ON_ONCE(!got_indlen || !new_indlen); - stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, - del->br_blockcount); + /* + * Steal as many blocks as we can to try and satisfy the worst + * case indlen for both new extents. + * + * However, we can't just steal reservations from the data + * blocks if this is an RT inodes as the data and metadata + * blocks come from different pools. We'll have to live with + * under-filled indirect reservation in this case. + */ + da_new = got_indlen + new_indlen; + if (da_new > da_old && !isrt) { + stolen = XFS_FILBLKS_MIN(da_new - da_old, + del->br_blockcount); + da_old += stolen; + } + if (da_new > da_old) + xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen); + da_new = got_indlen + new_indlen; got->br_startblock = nullstartblock((int)got_indlen); @@ -4919,20 +4786,29 @@ xfs_bmap_del_extent_delay( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); - da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; break; } ASSERT(da_old >= da_new); da_diff = da_old - da_new; - if (!isrt) - da_diff += del->br_blockcount; - if (da_diff) { - xfs_mod_fdblocks(mp, da_diff, false); - xfs_mod_delalloc(mp, -da_diff); + fdblocks = da_diff; + + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { + fdblocks += del->br_blockcount; } - return error; + + xfs_add_fdblocks(mp, fdblocks); + xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); } void @@ -5006,6 +4882,34 @@ xfs_bmap_del_extent_cow( ip->i_delayed_blks -= del->br_blockcount; } +static int +xfs_bmap_free_rtblocks( + struct xfs_trans *tp, + struct xfs_bmbt_irec *del) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = xfs_rtgroup_grab(tp->t_mountp, 0); + if (!rtg) + return -EIO; + + /* + * Ensure the bitmap and summary inodes are locked and joined to the + * transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP); + } + + error = xfs_rtfree_blocks(tp, rtg, del->br_startblock, + del->br_blockcount); + xfs_rtgroup_rele(rtg); + return error; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space. @@ -5023,8 +4927,7 @@ xfs_bmap_del_extent_real( { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int do_fx; /* free extent at end of routine */ - int error; /* error return value */ + int error = 0; /* error return value */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5067,20 +4970,10 @@ xfs_bmap_del_extent_real( return -ENOSPC; *logflagsp = XFS_ILOG_CORE; - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - if (!(bflags & XFS_BMAPI_REMAP)) { - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); - if (error) - return error; - } - - do_fx = 0; + if (xfs_ifork_is_realtime(ip, whichfork)) qfield = XFS_TRANS_DQ_RTBCOUNT; - } else { - do_fx = 1; + else qfield = XFS_TRANS_DQ_BCOUNT; - } nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; @@ -5088,8 +4981,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } if (got.br_startoff == del->br_startoff) @@ -5113,8 +5008,10 @@ xfs_bmap_del_extent_real( } if ((error = xfs_btree_delete(cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } break; case BMAP_LEFT_FILLING: /* @@ -5186,8 +5083,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* * Update the btree record back * to the original value. @@ -5203,8 +5102,10 @@ xfs_bmap_del_extent_real( *logflagsp = 0; return -ENOSPC; } - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } else *logflagsp |= xfs_ilog_fext(whichfork); @@ -5220,18 +5121,39 @@ xfs_bmap_del_extent_real( /* * If we need to, add to list of extents to delete. */ - if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (!(bflags & XFS_BMAPI_REMAP)) { + bool isrt = xfs_ifork_is_realtime(ip, whichfork); + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - xfs_refcount_decrease_extent(tp, del); + xfs_refcount_decrease_extent(tp, isrt, del); + } else if (isrt && !xfs_has_rtgroups(mp)) { + error = xfs_bmap_free_rtblocks(tp, del); } else { + unsigned int efi_flags = 0; + + if ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN) + efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; + + /* + * Historically, we did not use EFIs to free realtime + * extents. However, when reverse mapping is enabled, + * we must maintain the same order of operations as the + * data device, which is: Remove the file mapping, + * remove the reverse mapping, and then free the + * blocks. Reflink for realtime volumes requires the + * same sort of ordering. Both features rely on + * rtgroups, so let's gate rt EFI usage on rtgroups. + */ + if (isrt) + efi_flags |= XFS_FREE_EXTENT_REALTIME; + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - XFS_AG_RESV_NONE, - ((bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN)); - if (error) - return error; + XFS_AG_RESV_NONE, efi_flags); } + if (error) + return error; } /* @@ -5286,12 +5208,14 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = xfs_ifork_ptr(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(len > 0); ASSERT(nexts >= 0); @@ -5304,7 +5228,7 @@ __xfs_bunmapi( return 0; } XFS_STATS_INC(mp, xs_blk_unmap); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); end = start + len; if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { @@ -5317,20 +5241,9 @@ __xfs_bunmapi( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; } else cur = NULL; - if (isrt) { - /* - * Synchronize by locking the bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - } - extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts)) { @@ -5367,7 +5280,7 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - if (!isrt) + if (!isrt || (flags & XFS_BMAPI_REMAP)) goto delete; mod = xfs_rtb_to_rtxoff(mp, @@ -5385,7 +5298,7 @@ __xfs_bunmapi( * This piece is unwritten, or we're not * using unwritten extents. Skip over it. */ - ASSERT(end >= mod); + ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod); end -= mod > del.br_blockcount ? del.br_blockcount : mod; if (end < got.br_startoff && @@ -5491,18 +5404,17 @@ __xfs_bunmapi( delete: if (wasdel) { - error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, - &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; + if (error) + goto error0; } - if (error) - goto error0; - end = del.br_startoff - 1; nodelete: /* @@ -5555,7 +5467,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5585,6 +5497,8 @@ xfs_bunmapi( */ STATIC bool xfs_bmse_can_merge( + struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_bmbt_irec *got, /* current extent to shift */ xfs_fileoff_t shift) /* shift fsb */ @@ -5600,7 +5514,8 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) || + !xfs_bmap_same_rtgroup(ip, whichfork, left, got)) return false; return true; @@ -5635,9 +5550,8 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_bmse_can_merge(left, got, shift)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift)); new = *left; new.br_blockcount = blockcount; @@ -5657,21 +5571,27 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_btree_delete(cur, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, &new); if (error) @@ -5719,8 +5639,10 @@ xfs_bmap_shift_update_extent( error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, got); if (error) @@ -5758,28 +5680,28 @@ xfs_bmap_collapse_extents( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; goto del_cursor; } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } @@ -5791,7 +5713,8 @@ xfs_bmap_collapse_extents( goto del_cursor; } - if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(ip, whichfork, &prev, &got, + offset_shift_fsb)) { error = xfs_bmse_merge(tp, ip, whichfork, offset_shift_fsb, &icur, &got, &prev, cur, &logflags); @@ -5837,7 +5760,7 @@ xfs_bmap_can_insert_extents( int is_empty; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (xfs_is_shutdown(ip->i_mount)) return -EIO; @@ -5873,22 +5796,21 @@ xfs_bmap_insert_extents( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (*next_fsb == NULLFSBLOCK) { xfs_iext_last(ifp, &icur); @@ -5904,11 +5826,13 @@ xfs_bmap_insert_extents( } } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } @@ -5926,7 +5850,8 @@ xfs_bmap_insert_extents( * never find mergeable extents in this scenario. Check anyways * and warn if we encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + if (xfs_bmse_can_merge(ip, whichfork, &got, &next, + offset_shift_fsb)) WARN_ON_ONCE(1); } @@ -5976,6 +5901,7 @@ xfs_bmap_split_extent( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6002,11 +5928,11 @@ xfs_bmap_split_extent( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6034,6 +5960,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6041,6 +5968,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6060,7 +5988,7 @@ xfs_bmap_split_extent( del_cursor: if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6069,17 +5997,8 @@ del_cursor: return error; } -/* Deferred mapping is only for real extents in the data fork. */ -static bool -xfs_bmap_is_update_needed( - struct xfs_bmbt_irec *bmap) -{ - return bmap->br_startblock != HOLESTARTBLOCK && - bmap->br_startblock != DELAYSTARTBLOCK; -} - /* Record a bmap intent. */ -static int +static inline void __xfs_bmap_add( struct xfs_trans *tp, enum xfs_bmap_intent_type type, @@ -6089,25 +6008,19 @@ __xfs_bmap_add( { struct xfs_bmap_intent *bi; - trace_xfs_bmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - ip->i_ino, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); + if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) || + bmap->br_startblock == HOLESTARTBLOCK || + bmap->br_startblock == DELAYSTARTBLOCK) + return; - bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); - return 0; + xfs_bmap_defer_add(tp, bi); } /* Map an extent into a file. */ @@ -6115,12 +6028,10 @@ void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -6128,12 +6039,10 @@ void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV); } /* @@ -6147,36 +6056,35 @@ xfs_bmap_finish_one( { struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; + int flags = 0; - ASSERT(tp->t_highest_agno == NULLAGNUMBER); + if (bi->bi_whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; - trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - bi->bi_type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - bi->bi_owner->i_ino, bi->bi_whichfork, - bmap->br_startoff, bmap->br_blockcount, - bmap->br_state); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); - if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) - return -EFSCORRUPTED; + trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { case XFS_BMAP_MAP: + if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) + flags |= XFS_BMAPI_PREALLOC; error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, - bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount, bmap->br_startblock, + flags); bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, - &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); + &bmap->br_blockcount, flags | XFS_BMAPI_REMAP, + 1); break; default: ASSERT(0); + xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); error = -EFSCORRUPTED; } @@ -6257,7 +6165,7 @@ xfs_bunmapi_range( xfs_filblks_t unmap_len = endoff - startoff + 1; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); while (unmap_len > 0) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -6270,7 +6178,98 @@ xfs_bunmapi_range( error = xfs_defer_finish(tpp); if (error) goto out; + cond_resched(); } out: return error; } + +struct xfs_bmap_query_range { + xfs_bmap_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_bmap_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_bmap_query_range *query = priv; + struct xfs_bmbt_irec irec; + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork, + &irec); + if (fa) { + xfs_btree_mark_sick(cur); + return xfs_bmap_complain_bad_rec(cur->bc_ino.ip, + cur->bc_ino.whichfork, fa, &irec); + } + + return query->fn(cur, &irec, query->priv); +} + +/* Find all bmaps. */ +int +xfs_bmap_query_all( + struct xfs_btree_cur *cur, + xfs_bmap_query_range_fn fn, + void *priv) +{ + struct xfs_bmap_query_range query = { + .priv = priv, + .fn = fn, + }; + + return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); +} + +/* Helper function to extract extent size hint from inode */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (!xfs_is_always_cow_inode(ip) && + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + return ip->i_extsize; + if (XFS_IS_REALTIME_INODE(ip) && + ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + +/* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_cowextsize; + if (XFS_IS_REALTIME_INODE(ip)) { + b = 0; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + b = ip->i_extsize; + } else { + b = xfs_get_extsz_hint(ip); + } + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f6b73f1bad5f..d5f2729305fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) @@ -158,7 +162,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; @@ -176,9 +180,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); -int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, + int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); +int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t total, int *logflagsp, int whichfork, + void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv), + void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -195,9 +206,9 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); -int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, +void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -212,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -226,29 +233,31 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, }; +#define XFS_BMAP_INTENT_STRINGS \ + { XFS_BMAP_MAP, "map" }, \ + { XFS_BMAP_UNMAP, "unmap" } + struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; - struct xfs_perag *bi_pag; + struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; -void xfs_bmap_update_get_group(struct xfs_mount *mp, - struct xfs_bmap_intent *bi); - int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { @@ -280,4 +289,15 @@ extern struct kmem_cache *xfs_bmap_intent_cache; int __init xfs_bmap_intent_init_cache(void); void xfs_bmap_intent_destroy_cache(void); +typedef int (*xfs_bmap_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv); + +int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, + void *priv); + +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 71f2d50f7823..908d7b050e9c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -26,6 +26,22 @@ static struct kmem_cache *xfs_bmbt_cur_cache; +void +xfs_bmbt_init_block( + struct xfs_inode *ip, + struct xfs_btree_block *buf, + struct xfs_buf *bp, + __u16 level, + __u16 numrecs) +{ + if (bp) + xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); + else + xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); +} + /* * Convert on-disk form of btree root to in-memory form. */ @@ -44,17 +60,15 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, rblock, NULL, 0, 0); rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMDR_KEY_ADDR(dblock, 1); - tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); - tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + fkp = xfs_bmdr_key_addr(dblock, 1); + tkp = xfs_bmbt_key_addr(mp, rblock, 1); + fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); + tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -154,10 +168,10 @@ xfs_bmbt_to_bmdr( dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - tkp = XFS_BMDR_KEY_ADDR(dblock, 1); - fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); - tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + fkp = xfs_bmbt_key_addr(mp, rblock, 1); + tkp = xfs_bmdr_key_addr(dblock, 1); + fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -171,13 +185,8 @@ xfs_bmbt_dup_cursor( new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ino.ip, cur->bc_ino.whichfork); - - /* - * Copy the firstblock, dfops, and flags values, - * since init cursor doesn't get them. - */ - new->bc_ino.flags = cur->bc_ino.flags; - + new->bc_flags |= (cur->bc_flags & + (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL)); return new; } @@ -189,10 +198,10 @@ xfs_bmbt_update_cursor( ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) || (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); - dst->bc_ino.allocated += src->bc_ino.allocated; + dst->bc_bmap.allocated += src->bc_bmap.allocated; dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno; - src->bc_ino.allocated = 0; + src->bc_bmap.allocated = 0; } STATIC int @@ -211,7 +220,7 @@ xfs_bmbt_alloc_block( xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork); args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; + args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) return -ENOSPC; @@ -247,7 +256,7 @@ xfs_bmbt_alloc_block( } ASSERT(args.len == 1); - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, @@ -273,7 +282,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -360,14 +369,6 @@ xfs_bmbt_init_rec_from_cur( xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); } -STATIC void -xfs_bmbt_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - ptr->l = 0; -} - STATIC int64_t xfs_bmbt_key_diff( struct xfs_btree_cur *cur, @@ -419,7 +420,7 @@ xfs_bmbt_verify( * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. */ - fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; } @@ -435,7 +436,7 @@ xfs_bmbt_verify( if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) return __this_address; - return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]); + return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]); } static void @@ -444,7 +445,7 @@ xfs_bmbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_lblock_verify_crc(bp)) + if (!xfs_btree_fsblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_bmbt_verify(bp); @@ -468,7 +469,7 @@ xfs_bmbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_lblock_calc_crc(bp); + xfs_btree_fsblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -515,9 +516,126 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } -static const struct xfs_btree_ops xfs_bmbt_ops = { +static inline void +xfs_bmbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Reallocate the space for if_broot based on the number of records. Move the + * records and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by the caller. + * When growing this will create holes to be filled in by the caller. + * + * The caller must not request to add more records than would fit in the + * on-disk inode root. If the if_broot is currently NULL, then if we are + * adding records, one will be allocated. The caller must also not request + * that the number of records go below zero, although it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * whichfork -- which inode fork to change + * new_numrecs -- the new number of records requested for the if_broot array + * + * Returns the incore btree root block. + */ +struct xfs_btree_block * +xfs_bmap_broot_realloc( + struct xfs_inode *ip, + int whichfork, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + + /* + * Block mapping btrees do not support storing zero records; if this + * happens, the fork is being changed to FMT_EXTENTS. Free the broot + * and get out. + */ + if (new_numrecs == 0) + return xfs_broot_realloc(ifp, 0); + + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); + return broot; + } + + /* + * We're reducing, but not totally eliminating, numrecs. In this case, + * we are shrinking the if_broot buffer, so it must already exist. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); + + /* + * Shrink the btree root by moving the bmbt pointers, since they are + * not butted up against the btree block header, then reallocating + * broot. + */ + xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + return broot; +} + +static struct xfs_btree_block * +xfs_bmbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + new_numrecs); +} + +const struct xfs_btree_ops xfs_bmbt_ops = { + .name = "bmap", + .type = XFS_BTREE_TYPE_INODE, + .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_BMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2), .dup_cursor = xfs_bmbt_dup_cursor, .update_cursor = xfs_bmbt_update_cursor, @@ -529,44 +647,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_key_from_rec = xfs_bmbt_init_key_from_rec, .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, - .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .diff_two_keys = xfs_bmbt_diff_two_keys, .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, .keys_contiguous = xfs_bmbt_keys_contiguous, + .broot_realloc = xfs_bmbt_broot_realloc, }; -static struct xfs_btree_cur * -xfs_bmbt_init_common( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_btree_cur *cur; - - ASSERT(whichfork != XFS_COW_FORK); - - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, - mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); - - cur->bc_ops = &xfs_bmbt_ops; - cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - - cur->bc_ino.ip = ip; - cur->bc_ino.allocated = 0; - cur->bc_ino.flags = 0; - - return cur; -} - /* - * Allocate a new bmap btree cursor. + * Create a new bmap btree cursor. + * + * For staging cursors -1 in passed in whichfork. */ struct xfs_btree_cur * xfs_bmbt_init_cursor( @@ -575,15 +668,34 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + unsigned int maxlevels; - cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + /* + * The Data fork always has larger maxlevel, so use that for staging + * cursors. + */ + switch (whichfork) { + case XFS_STAGING_FORK: + maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK]; + break; + default: + maxlevels = mp->m_bm_maxlevels[whichfork]; + break; + } + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels, + xfs_bmbt_cur_cache); + cur->bc_ino.ip = ip; cur->bc_ino.whichfork = whichfork; + cur->bc_bmap.allocated = 0; + if (whichfork != XFS_STAGING_FORK) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + } return cur; } @@ -599,33 +711,6 @@ xfs_bmbt_block_maxrecs( } /* - * Allocate a new bmap btree cursor for reloading an inode block mapping data - * structure. Note that callers can use the staged cursor to reload extents - * format inode forks if they rebuild the iext tree and commit the staged - * cursor immediately. - */ -struct xfs_btree_cur * -xfs_bmbt_stage_cursor( - struct xfs_mount *mp, - struct xfs_inode *ip, - struct xbtree_ifakeroot *ifake) -{ - struct xfs_btree_cur *cur; - struct xfs_btree_ops *ops; - - /* data fork always has larger maxheight */ - cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); - cur->bc_nlevels = ifake->if_levels; - cur->bc_ino.forksize = ifake->if_fork_size; - - /* Don't let anyone think we're attached to the real fork yet. */ - cur->bc_ino.whichfork = -1; - xfs_btree_stage_ifakeroot(cur, ifake, &ops); - ops->update_cursor = NULL; - return cur; -} - -/* * Swap in the new inode fork root. Once we pass this point the newly rebuilt * mappings are in place and we have to kill off any old btree blocks. */ @@ -665,19 +750,19 @@ xfs_bmbt_commit_staged_btree( break; } xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); - xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); + xfs_btree_commit_ifakeroot(cur, tp, whichfork); } /* * Calculate number of records in a bmap btree block. */ -int +unsigned int xfs_bmbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { - blocklen -= XFS_BMBT_BLOCK_LEN(mp); + blocklen -= xfs_bmbt_block_len(mp); return xfs_bmbt_block_maxrecs(blocklen, leaf); } @@ -751,7 +836,7 @@ xfs_bmbt_change_owner( ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); - cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; + cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 151b8491f60e..b238d559ab03 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -14,70 +14,6 @@ struct xfs_trans; struct xbtree_ifakeroot; /* - * Btree block header size depends on a superblock flag. - */ -#define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_has_crc(((mp))) ? \ - XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) - -#define XFS_BMBT_REC_ADDR(mp, block, index) \ - ((xfs_bmbt_rec_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_rec_t))) - -#define XFS_BMBT_KEY_ADDR(mp, block, index) \ - ((xfs_bmbt_key_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_key_t))) - -#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ - ((xfs_bmbt_ptr_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - (maxrecs) * sizeof(xfs_bmbt_key_t) + \ - ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) - -#define XFS_BMDR_REC_ADDR(block, index) \ - ((xfs_bmdr_rec_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_rec_t))) - -#define XFS_BMDR_KEY_ADDR(block, index) \ - ((xfs_bmdr_key_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_key_t))) - -#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ - ((xfs_bmdr_ptr_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - (maxrecs) * sizeof(xfs_bmdr_key_t) + \ - ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) - -/* - * These are to be used when we know the size of the block and - * we don't have a cursor. - */ -#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ - XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) - -#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ - (int)(XFS_BMBT_BLOCK_LEN(mp) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) - -#define XFS_BMAP_BROOT_SPACE(mp, bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) -#define XFS_BMDR_SPACE_CALC(nrecs) \ - (int)(sizeof(xfs_bmdr_block_t) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BMDR_SPACE(bb) \ - (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) - -/* * Maximum number of bmap btree levels. */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) @@ -99,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(int blocklen, int leaf); -extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_ino_t new_owner, @@ -107,8 +44,6 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, - struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork); @@ -120,4 +55,150 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void); int __init xfs_bmbt_init_cur_cache(void); void xfs_bmbt_destroy_cur_cache(void); +void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, + struct xfs_buf *bp, __u16 level, __u16 numrecs); + +/* + * Btree block header size depends on a superblock flag. + */ +static inline size_t +xfs_bmbt_block_len(struct xfs_mount *mp) +{ + return xfs_has_crc(mp) ? + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN; +} + +/* Addresses of key, pointers, and records within an incore bmbt block. */ + +static inline struct xfs_bmbt_rec * +xfs_bmbt_rec_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_rec *) + ((char *)block + xfs_bmbt_block_len(mp) + + (index - 1) * sizeof(struct xfs_bmbt_rec)); +} + +static inline struct xfs_bmbt_key * +xfs_bmbt_key_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_key *) + ((char *)block + xfs_bmbt_block_len(mp) + + (index - 1) * sizeof(struct xfs_bmbt_key *)); +} + +static inline xfs_bmbt_ptr_t * +xfs_bmbt_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_bmbt_ptr_t *) + ((char *)block + xfs_bmbt_block_len(mp) + + maxrecs * sizeof(struct xfs_bmbt_key) + + (index - 1) * sizeof(xfs_bmbt_ptr_t)); +} + +/* Addresses of key, pointers, and records within an ondisk bmbt block. */ + +static inline struct xfs_bmbt_rec * +xfs_bmdr_rec_addr( + struct xfs_bmdr_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_bmbt_rec)); +} + +static inline struct xfs_bmbt_key * +xfs_bmdr_key_addr( + struct xfs_bmdr_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_bmbt_key)); +} + +static inline xfs_bmbt_ptr_t * +xfs_bmdr_ptr_addr( + struct xfs_bmdr_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_bmbt_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_bmbt_key) + + (index - 1) * sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_bmbt_ptr_t * +xfs_bmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int i, + unsigned int sz) +{ + return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_bmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int nrecs) +{ + return xfs_bmbt_block_len(mp) + + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_bmap_broot_space( + struct xfs_mount *mp, + struct xfs_bmdr_block *bb) +{ + return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_bmdr_space_calc(unsigned int nrecs) +{ + return sizeof(struct xfs_bmdr_block) + + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_bmap_bmdr_space(struct xfs_btree_block *bb) +{ + return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); +} + +struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, + int whichfork, unsigned int new_numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ea8d3659df20..299ce7fd11b0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -27,28 +27,30 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_quota.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" /* * Btree magic numbers. */ -static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC, 0 }, - { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, - XFS_REFC_CRC_MAGIC } -}; - uint32_t xfs_btree_magic( - int crc, - xfs_btnum_t btnum) + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) { - uint32_t magic = xfs_magics[crc][btnum]; + int idx = xfs_has_crc(mp) ? 1 : 0; + __be32 magic = ops->buf_ops->magic[idx]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); - return magic; + return be32_to_cpu(magic); } /* @@ -63,10 +65,8 @@ xfs_btree_magic( * bytes. */ static inline xfs_failaddr_t -xfs_btree_check_lblock_siblings( +xfs_btree_check_fsblock_siblings( struct xfs_mount *mp, - struct xfs_btree_cur *cur, - int level, xfs_fsblock_t fsb, __be64 dsibling) { @@ -78,22 +78,33 @@ xfs_btree_check_lblock_siblings( sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_lptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_fsbno(mp, sibling)) - return __this_address; - } + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_memblock_siblings( + struct xfs_buftarg *btp, + xfbno_t bno, + __be64 dsibling) +{ + xfbno_t sibling; + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == bno) + return __this_address; + if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) + return __this_address; return NULL; } static inline xfs_failaddr_t -xfs_btree_check_sblock_siblings( +xfs_btree_check_agblock_siblings( struct xfs_perag *pag, - struct xfs_btree_cur *cur, - int level, xfs_agblock_t agbno, __be32 dsibling) { @@ -105,34 +116,21 @@ xfs_btree_check_sblock_siblings( sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_sptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_agbno(pag, sibling)) - return __this_address; - } + if (!xfs_verify_agbno(pag, sibling)) + return __this_address; return NULL; } -/* - * Check a long btree block header. Return the address of the failing check, - * or NULL if everything is ok. - */ -xfs_failaddr_t -__xfs_btree_check_lblock( +static xfs_failaddr_t +__xfs_btree_check_lblock_hdr( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); - xfs_failaddr_t fa; - xfs_fsblock_t fsb = NULLFSBLOCK; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != @@ -142,7 +140,7 @@ __xfs_btree_check_lblock( return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -150,65 +148,101 @@ __xfs_btree_check_lblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + return NULL; +} + +/* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_fsblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + xfs_fsblock_t fsb; + + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + + /* + * For inode-rooted btrees, the root block sits in the inode fork. In + * that case bp is NULL, and the block must not have any siblings. + */ + if (!bp) { + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + return NULL; + } - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } -/* Check a long btree block header. */ -static int -xfs_btree_check_lblock( +/* + * Check an in-memory btree block header. Return the address of the failing + * check, or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_memblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfs_failaddr_t fa; + xfbno_t bno; - fa = __xfs_btree_check_lblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - return -EFSCORRUPTED; - } - return 0; + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + return fa; } /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. */ -xfs_failaddr_t -__xfs_btree_check_sblock( +static xfs_failaddr_t +__xfs_btree_check_agblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; - xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agblock_t agbno; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -216,36 +250,45 @@ __xfs_btree_check_sblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } -/* Check a short btree block header. */ -STATIC int -xfs_btree_check_sblock( +/* + * Internal btree block check. + * + * Return NULL if the block is ok or the address of the failed check otherwise. + */ +xfs_failaddr_t +__xfs_btree_check_block( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_failaddr_t fa; - - fa = __xfs_btree_check_sblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - return -EFSCORRUPTED; + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return __xfs_btree_check_memblock(cur, block, level, bp); + case XFS_BTREE_TYPE_AG: + return __xfs_btree_check_agblock(cur, block, level, bp); + case XFS_BTREE_TYPE_INODE: + return __xfs_btree_check_fsblock(cur, block, level, bp); + default: + ASSERT(0); + return __this_address; } - return 0; +} + +static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) + return XFS_ERRTAG_BTREE_CHECK_SBLOCK; + return XFS_ERRTAG_BTREE_CHECK_LBLOCK; } /* @@ -258,34 +301,49 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return xfs_btree_check_lblock(cur, block, level, bp); - else - return xfs_btree_check_sblock(cur, block, level, bp); -} + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; -/* Check that this long pointer is valid and points within the fs. */ -bool -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, - xfs_fsblock_t fsbno, - int level) -{ - if (level <= 0) - return false; - return xfs_verify_fsbno(cur->bc_mp, fsbno); + fa = __xfs_btree_check_block(cur, block, level, bp); + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + return 0; } -/* Check that this short pointer is valid and points within the AG. */ -bool -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - int level) +int +__xfs_btree_check_ptr( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (level <= 0) - return false; - return xfs_verify_agbno(cur->bc_ag.pag, agbno); + return -EFSCORRUPTED; + + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_INODE: + if (!xfs_verify_fsbno(cur->bc_mp, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_AG: + if (!xfs_verify_agbno(to_perag(cur->bc_group), + be32_to_cpu((&ptr->s)[index]))) + return -EFSCORRUPTED; + break; + } + + return 0; } /* @@ -299,26 +357,35 @@ xfs_btree_check_ptr( int index, int level) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, -"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", + int error; + + error = __xfs_btree_check_ptr(cur, ptr, index, level); + if (error) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + xfs_err(cur->bc_mp, +"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", + cur->bc_ops->name, cur->bc_flags, level, index, + __this_address); + break; + case XFS_BTREE_TYPE_INODE: + xfs_err(cur->bc_mp, +"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, - cur->bc_ino.whichfork, cur->bc_btnum, + cur->bc_ino.whichfork, cur->bc_ops->name, level, index); - } else { - if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, -"AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_btnum, + break; + case XFS_BTREE_TYPE_AG: + xfs_err(cur->bc_mp, +"AG %u: Corrupt %sbt pointer at level %d index %d.", + cur->bc_group->xg_gno, cur->bc_ops->name, level, index); + break; + } + xfs_btree_mark_sick(cur); } - return -EFSCORRUPTED; + return error; } #ifdef DEBUG @@ -336,7 +403,7 @@ xfs_btree_check_ptr( * it to disk. */ void -xfs_btree_lblock_calc_crc( +xfs_btree_fsblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -350,7 +417,7 @@ xfs_btree_lblock_calc_crc( } bool -xfs_btree_lblock_verify_crc( +xfs_btree_fsblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -374,7 +441,7 @@ xfs_btree_lblock_verify_crc( * it to disk. */ void -xfs_btree_sblock_calc_crc( +xfs_btree_agblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -388,7 +455,7 @@ xfs_btree_sblock_calc_crc( } bool -xfs_btree_sblock_verify_crc( +xfs_btree_agblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -410,6 +477,17 @@ xfs_btree_free_block( { int error; + trace_xfs_btree_free_block(cur, bp); + + /* + * Don't allow block freeing for a staging cursor, because staging + * cursors do not support regular btree modifications. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); @@ -448,33 +526,58 @@ xfs_btree_del_cursor( * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); - if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kmem_free(cur->bc_ops); - if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); + + if (cur->bc_group) + xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } +/* Return the buffer target for this btree's buffer. */ +static inline struct xfs_buftarg * +xfs_btree_buftarg( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return cur->bc_mem.xfbtree->target; + return cur->bc_mp->m_ddev_targp; +} + +/* Return the block size (in units of 512b sectors) for this btree. */ +static inline unsigned int +xfs_btree_bbsize( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return XFBNO_BBSIZE; + return cur->bc_mp->m_bsize; +} + /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ -int /* error */ +int /* error */ xfs_btree_dup_cursor( - struct xfs_btree_cur *cur, /* input cursor */ - struct xfs_btree_cur **ncur) /* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur) /* output cursor */ { - struct xfs_buf *bp; /* btree block's buffer pointer */ - int error; /* error return value */ - int i; /* level number of btree block */ - xfs_mount_t *mp; /* mount structure for filesystem */ - struct xfs_btree_cur *new; /* new cursor value */ - xfs_trans_t *tp; /* transaction pointer, can be NULL */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_trans *tp = cur->bc_tp; + struct xfs_buf *bp; + struct xfs_btree_cur *new; + int error; + int i; - tp = cur->bc_tp; - mp = cur->bc_mp; + /* + * Don't allow staging cursors to be duplicated because they're supposed + * to be kept private to a single thread. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } /* * Allocate a new cursor like the old one. @@ -494,10 +597,13 @@ xfs_btree_dup_cursor( new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - xfs_buf_daddr(bp), mp->m_bsize, - 0, &bp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, tp, + xfs_btree_buftarg(cur), + xfs_buf_daddr(bp), + xfs_btree_bbsize(cur), 0, &bp, + cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(new); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -539,7 +645,7 @@ xfs_btree_dup_cursor( * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! * - * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing + * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing * overlapping intervals. In such a tree, records are still sorted lowest to * highest and indexed by the smallest key value that refers to the record. * However, nodes are different: each pointer has two associated keys -- one @@ -589,26 +695,17 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; } - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_SBLOCK_CRC_LEN; return XFS_BTREE_SBLOCK_LEN; } /* - * Return size of btree block pointers for this btree instance. - */ -static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) -{ - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? - sizeof(__be64) : sizeof(__be32); -} - -/* * Calculate offset of the n-th record in a btree block. */ STATIC size_t @@ -655,7 +752,7 @@ xfs_btree_ptr_offset( { return xfs_btree_block_len(cur) + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + - (n - 1) * xfs_btree_ptr_len(cur); + (n - 1) * cur->bc_ops->ptr_len; } /* @@ -718,7 +815,7 @@ struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; @@ -750,8 +847,7 @@ xfs_btree_get_block( int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (xfs_btree_at_iroot(cur, level)) { *bpp = NULL; return xfs_btree_get_iroot(cur); } @@ -856,95 +952,52 @@ xfs_btree_offsets( } } -/* - * Get a buffer for the block, return it read in. - * Long-form addressing. - */ -int -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - if (!xfs_verify_fsbno(mp, fsbno)) - return -EFSCORRUPTED; - d = XFS_FSB_TO_DADDR(mp, fsbno); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, 0, &bp, ops); - if (error) - return error; - if (bp) - xfs_buf_set_ref(bp, refval); - *bpp = bp; - return 0; -} - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. - */ -/* ARGSUSED */ -void -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) +STATIC int +xfs_btree_readahead_fsblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) { - xfs_daddr_t d; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + int rval = 0; - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); -} + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), + mp->m_bsize, cur->bc_ops->buf_ops); + rval++; + } -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. - */ -/* ARGSUSED */ -void -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) -{ - xfs_daddr_t d; + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), + mp->m_bsize, cur->bc_ops->buf_ops); + rval++; + } - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); + return rval; } STATIC int -xfs_btree_readahead_lblock( +xfs_btree_readahead_memblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; - xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); - xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); rval++; } @@ -952,25 +1005,28 @@ xfs_btree_readahead_lblock( } STATIC int -xfs_btree_readahead_sblock( +xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, - struct xfs_btree_block *block) + struct xfs_btree_block *block) { - int rval = 0; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); - + int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + xfs_agbno_to_daddr(pag, left), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - right, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + xfs_agbno_to_daddr(pag, right), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } @@ -993,8 +1049,7 @@ xfs_btree_readahead( * No readahead needed if we are at the root level and the * btree root is stored in the inode. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (lev == cur->bc_nlevels - 1)) + if (xfs_btree_at_iroot(cur, lev)) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) @@ -1003,9 +1058,17 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return xfs_btree_readahead_lblock(cur, lr, block); - return xfs_btree_readahead_sblock(cur, lr, block); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_INODE: + return xfs_btree_readahead_fsblock(cur, lr, block); + case XFS_BTREE_TYPE_MEM: + return xfs_btree_readahead_memblock(cur, lr, block); + default: + ASSERT(0); + return 0; + } } STATIC int @@ -1014,23 +1077,24 @@ xfs_btree_ptr_to_daddr( const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - fsbno = be64_to_cpu(ptr->l); - *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); - } else { - agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), + be32_to_cpu(ptr->s)); + break; + case XFS_BTREE_TYPE_INODE: + *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + break; + case XFS_BTREE_TYPE_MEM: + *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); + break; } - return 0; } @@ -1050,8 +1114,9 @@ xfs_btree_readahead_ptr( if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; - xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, - cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); + xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, + xfs_btree_bbsize(cur) * count, + cur->bc_ops->buf_ops); } /* @@ -1072,7 +1137,7 @@ xfs_btree_setbuf( cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) @@ -1090,7 +1155,7 @@ xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); @@ -1101,12 +1166,23 @@ xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); } +static inline bool +xfs_btree_ptrs_equal( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr1, + union xfs_btree_ptr *ptr2) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + return ptr1->l == ptr2->l; + return ptr1->s == ptr2->s; +} + /* * Get/set/init sibling pointers */ @@ -1119,7 +1195,7 @@ xfs_btree_get_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else @@ -1141,7 +1217,7 @@ xfs_btree_set_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else @@ -1154,25 +1230,24 @@ xfs_btree_set_sibling( } } -void -xfs_btree_init_block_int( +static void +__xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, + const struct xfs_btree_ops *ops, xfs_daddr_t blkno, - xfs_btnum_t btnum, __u16 level, __u16 numrecs, - __u64 owner, - unsigned int flags) + __u64 owner) { - int crc = xfs_has_crc(mp); - __u32 magic = xfs_btree_magic(crc, btnum); + bool crc = xfs_has_crc(mp); + __u32 magic = xfs_btree_magic(mp, ops); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); - if (flags & XFS_BTREE_LONG_PTRS) { + if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { @@ -1183,14 +1258,12 @@ xfs_btree_init_block_int( buf->bb_u.l.bb_lsn = 0; } } else { - /* owner is a 32 bit value on short blocks */ - __u32 __owner = (__u32)owner; - buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); - buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + /* owner is a 32 bit value on short blocks */ + buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } @@ -1199,64 +1272,57 @@ xfs_btree_init_block_int( void xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner) + struct xfs_mount *mp, + struct xfs_btree_block *block, + const struct xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), - btnum, level, numrecs, owner, 0); + __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, + numrecs, owner); } void -xfs_btree_init_block_cur( - struct xfs_btree_cur *cur, - struct xfs_buf *bp, - int level, - int numrecs) +xfs_btree_init_buf( + struct xfs_mount *mp, + struct xfs_buf *bp, + const struct xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) { - __u64 owner; - - /* - * we can pull the owner from the cursor right now as the different - * owners align directly with the pointer size of the btree. This may - * change in future, but is safe for current users of the generic btree - * code. - */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_ino.ip->i_ino; - else - owner = cur->bc_ag.pag->pag_agno; - - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), - xfs_buf_daddr(bp), cur->bc_btnum, level, - numrecs, owner, cur->bc_flags); + __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, + xfs_buf_daddr(bp), level, numrecs, owner); + bp->b_ops = ops->buf_ops; } -/* - * Return true if ptr is the last record in the btree and - * we need to track updates to this record. The decision - * will be further refined in the update_lastrec method. - */ -STATIC int -xfs_btree_is_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - int level) +static inline __u64 +xfs_btree_owner( + struct xfs_btree_cur *cur) { - union xfs_btree_ptr ptr; - - if (level > 0) - return 0; - if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return cur->bc_mem.xfbtree->owner; + case XFS_BTREE_TYPE_INODE: + return cur->bc_ino.ip->i_ino; + case XFS_BTREE_TYPE_AG: + return cur->bc_group->xg_gno; + default: + ASSERT(0); return 0; + } +} - xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); - if (!xfs_btree_ptr_is_null(cur, &ptr)) - return 0; - return 1; +void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + int numrecs) +{ + xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, + xfs_btree_owner(cur)); } STATIC void @@ -1265,41 +1331,27 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))); - else { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_INODE: + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_MEM: + ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); + break; } } -STATIC void +static inline void xfs_btree_set_refs( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); - break; - case XFS_BTNUM_INO: - case XFS_BTNUM_FINO: - xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); - break; - case XFS_BTNUM_BMAP: - xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); - break; - case XFS_BTNUM_RMAP: - xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); - break; - case XFS_BTNUM_REFC: - xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); - break; - default: - ASSERT(0); - } + xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); } int @@ -1309,15 +1361,14 @@ xfs_btree_get_buf_block( struct xfs_btree_block **block, struct xfs_buf **bpp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_daddr_t d; - int error; + xfs_daddr_t d; + int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, - 0, bpp); + error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), 0, bpp); if (error) return error; @@ -1348,9 +1399,11 @@ xfs_btree_read_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), flags, bpp, + cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; @@ -1398,7 +1451,7 @@ xfs_btree_copy_ptrs( int numptrs) { ASSERT(numptrs >= 0); - memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); + memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1454,8 +1507,8 @@ xfs_btree_shift_ptrs( ASSERT(numptrs >= 0); ASSERT(dir == 1 || dir == -1); - dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); - memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); + dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); + memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1490,12 +1543,16 @@ xfs_btree_log_recs( int first, int last) { + if (!bp) { + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); + return; + } xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - } /* @@ -1566,7 +1623,7 @@ xfs_btree_log_block( if (bp) { int nbits; - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { /* * We don't log the CRC when updating a btree * block but instead recreate it during log @@ -1581,7 +1638,7 @@ xfs_btree_log_block( nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, - (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1658,9 +1715,10 @@ xfs_btree_increment( * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1751,9 +1809,10 @@ xfs_btree_decrement( * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1786,6 +1845,33 @@ error0: return error; } +/* + * Check the btree block owner now that we have the context to know who the + * real owner is. + */ +static inline xfs_failaddr_t +xfs_btree_check_block_owner( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block) +{ + __u64 owner; + + if (!xfs_has_crc(cur->bc_mp) || + (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) + return NULL; + + owner = xfs_btree_owner(cur); + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) + return __this_address; + } else { + if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) + return __this_address; + } + + return NULL; +} + int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ @@ -1798,8 +1884,7 @@ xfs_btree_lookup_get_block( int error = 0; /* special case the root block if in an inode */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (xfs_btree_at_iroot(cur, level)) { *blkp = xfs_btree_get_iroot(cur); return 0; } @@ -1824,11 +1909,7 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. */ - if (xfs_has_crc(cur->bc_mp) && - !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && - (cur->bc_flags & XFS_BTREE_LONG_PTRS) && - be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_ino.ip->i_ino) + if (xfs_btree_check_block_owner(cur, *blkp) != NULL) goto out_bad; /* Did we get the level we were looking for? */ @@ -1846,6 +1927,7 @@ out_bad: *blkp = NULL; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -1872,6 +1954,27 @@ xfs_lookup_get_search_key( } /* + * Initialize a pointer to the root block. + */ +void +xfs_btree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + /* + * Inode-rooted btrees call xfs_btree_get_iroot to find the root + * in xfs_btree_lookup_get_block and don't need a pointer here. + */ + ptr->l = 0; + } else if (cur->bc_flags & XFS_BTREE_STAGING) { + ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); + } else { + cur->bc_ops->init_ptr_from_cur(cur, ptr); + } +} + +/* * Lookup the record. The cursor is made to point to it, based on dir. * stat is set to 0 if can't find any such record, 1 for success. */ @@ -1892,14 +1995,16 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ - if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) + if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } block = NULL; keyno = 0; /* initialise start pointer from cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); pp = &ptr; /* @@ -1936,6 +2041,7 @@ xfs_btree_lookup( XFS_ERRLEVEL_LOW, cur->bc_mp, block, sizeof(*block)); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -2012,8 +2118,10 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } *stat = 1; return 0; } @@ -2040,7 +2148,7 @@ xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); return (union xfs_btree_key *)((char *)key + (cur->bc_ops->key_len / 2)); } @@ -2061,7 +2169,7 @@ xfs_btree_get_leaf_keys( rec = xfs_btree_rec_addr(cur, 1, block); cur->bc_ops->init_key_from_rec(key, rec); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { @@ -2088,7 +2196,7 @@ xfs_btree_get_node_keys( union xfs_btree_key *high; int n; - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len / 2); @@ -2132,7 +2240,7 @@ xfs_btree_needs_key_update( struct xfs_btree_cur *cur, int ptr) { - return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; + return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; } /* @@ -2156,7 +2264,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp; int ptr; - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); /* Exit if there aren't any parent levels to update. */ if (level + 1 >= cur->bc_nlevels) @@ -2225,7 +2333,7 @@ xfs_btree_update_keys( ASSERT(level >= 0); block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); /* @@ -2286,15 +2394,6 @@ xfs_btree_update( xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_log_recs(cur, bp, ptr, ptr); - /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, 0)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_UPDATE); - } - /* Pass new key value up to our parent. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, 0); @@ -2332,8 +2431,7 @@ xfs_btree_lshift( int error; /* error return value */ int i; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "right". */ @@ -2460,12 +2558,13 @@ xfs_btree_lshift( * Using a temporary cursor, update the parent key values of the * block on the left. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2527,8 +2626,7 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "left". */ @@ -2636,6 +2734,7 @@ xfs_btree_rshift( goto error0; i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2645,7 +2744,7 @@ xfs_btree_rshift( goto error1; /* Update the parent high keys of the left block, if needed. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error1; @@ -2673,6 +2772,32 @@ error1: return error; } +static inline int +xfs_btree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *hint_block, + union xfs_btree_ptr *new_block, + int *stat) +{ + int error; + + /* + * Don't allow block allocation for a staging cursor, because staging + * cursors do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); + trace_xfs_btree_alloc_block(cur, new_block, *stat, error); + return error; +} + /* * Split cur/level block in half. * Return new block number and the key to its first @@ -2716,7 +2841,7 @@ __xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -2823,7 +2948,7 @@ __xfs_btree_split( } /* Update the parent high keys of the left block, if needed. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; @@ -2941,7 +3066,7 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP || + if (!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); @@ -2963,6 +3088,130 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ +/* Move the records from a root leaf block to a separate block. */ +STATIC void +xfs_btree_promote_leaf_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + struct xfs_btree_block *broot; + int numrecs = xfs_btree_get_numrecs(block); + + /* Copy the records from the leaf broot into the new child block. */ + rp = xfs_btree_rec_addr(cur, 1, block); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, crp, rp, numrecs); + + /* + * Increment the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * leaf to a node block. Free the existing leaf broot so that nobody + * thinks we need to migrate node pointers when we realloc the broot + * buffer after bumping nlevels. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels++; + cur->bc_levels[1].ptr = 1; + + /* + * Allocate a new node broot and initialize it to point to the new + * child block. + */ + broot = cur->bc_ops->broot_realloc(cur, 1); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, + cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); + + pp = xfs_btree_ptr_addr(cur, 1, broot); + kp = xfs_btree_key_addr(cur, 1, broot); + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, 0, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, cbp, 1, numrecs); +} + +/* + * Move the keys and pointers from a root block to a separate block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_promote_node_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + int numrecs = xfs_btree_get_numrecs(block); + + /* + * Increase tree height, adjusting the root block level to match. + * We cannot change the root btree node size until we've copied the + * block contents to the new child block. + */ + be16_add_cpu(&block->bb_level, 1); + cur->bc_nlevels++; + cur->bc_levels[level + 1].ptr = 1; + + /* + * Adjust the root btree record count, then copy the keys from the old + * root to the new child block. + */ + xfs_btree_set_numrecs(block, 1); + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, numrecs); + + /* Check the pointers and copy them to the new child block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); + + /* + * Set the first keyptr to point to the new child block, then shrink + * the memory buffer for the root block. + */ + error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); + if (error) + return error; + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + cur->bc_ops->broot_realloc(cur, 1); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, level, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, numrecs); + xfs_btree_log_ptrs(cur, cbp, 1, numrecs); + return 0; +} /* * Copy the old inode root contents into a real block and make the @@ -2977,26 +3226,27 @@ xfs_btree_new_iroot( struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ - union xfs_btree_key *ckp; /* child key pointer */ - union xfs_btree_ptr *cpp; /* child ptr pointer */ - union xfs_btree_key *kp; /* pointer to btree key */ - union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr aptr; union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ - int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); - pp = xfs_btree_ptr_addr(cur, 1, block); + ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); + if (level > 0) + aptr = *xfs_btree_ptr_addr(cur, 1, block); + else + aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, + cur->bc_ino.ip->i_ino)); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3014,61 +3264,45 @@ xfs_btree_new_iroot( * In that case have to also ensure the blkno remains correct */ memcpy(cblock, block, xfs_btree_block_len(cur)); - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; } - be16_add_cpu(&block->bb_level, 1); - xfs_btree_set_numrecs(block, 1); - cur->bc_nlevels++; - ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); - cur->bc_levels[level + 1].ptr = 1; - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); - - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (level > 0) { + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, + &nptr, cblock); if (error) goto error0; + } else { + xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); } - xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); - - error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); - if (error) - goto error0; - - xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - - xfs_iroot_realloc(cur->bc_ino.ip, - 1 - xfs_btree_get_numrecs(cblock), - cur->bc_ino.whichfork); - - xfs_btree_setbuf(cur, level, cbp); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); - xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - - *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); + *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: return error; } +static void +xfs_btree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + if (cur->bc_flags & XFS_BTREE_STAGING) { + /* Update the btree root information for a per-AG fake root. */ + cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); + cur->bc_ag.afake->af_levels += inc; + } else { + cur->bc_ops->set_root(cur, ptr, inc); + } +} + /* * Allocate a new root block, fill it in. */ @@ -3093,10 +3327,10 @@ xfs_btree_new_root( XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &rptr); + xfs_btree_init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) @@ -3109,7 +3343,7 @@ xfs_btree_new_root( goto error0; /* Set the root in the holding structure increasing the level by 1. */ - cur->bc_ops->set_root(cur, &lptr, 1); + xfs_btree_set_root(cur, &lptr, 1); /* * At the previous root level there are now two blocks: the old root, @@ -3213,13 +3447,12 @@ xfs_btree_make_block_unfull( { int error = 0; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); + cur->bc_ops->broot_realloc(cur, numrecs + 1); *stat = 1; } else { /* A root block that needs replacing */ @@ -3299,8 +3532,8 @@ xfs_btree_insrec( * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. */ - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level >= cur->bc_nlevels)) { + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && + level >= cur->bc_nlevels) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); @@ -3429,14 +3662,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. + * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. + * + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. In this case, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -3445,15 +3695,6 @@ xfs_btree_insrec( } /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_INSREC); - } - - /* * Return the new block number, if any. * If there is one, give back a record value and a cursor too. */ @@ -3524,6 +3765,7 @@ xfs_btree_insert( } if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3537,7 +3779,8 @@ xfs_btree_insert( if (pcur != cur && (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { /* Save the state from the cursor before we trash it */ - if (cur->bc_ops->update_cursor) + if (cur->bc_ops->update_cursor && + !(cur->bc_flags & XFS_BTREE_STAGING)) cur->bc_ops->update_cursor(pcur, cur); cur->bc_nlevels = pcur->bc_nlevels; xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); @@ -3555,6 +3798,97 @@ error0: return error; } +/* Move the records from a child leaf block to the root block. */ +STATIC void +xfs_btree_demote_leaf_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int numrecs) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + struct xfs_btree_block *broot; + + /* + * Decrease the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * node to a leaf. Free the old node broot so that we can get a fresh + * leaf broot. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels--; + + /* + * Allocate a new leaf broot and copy the records from the old child. + * Detach the old child from the cursor. + */ + broot = cur->bc_ops->broot_realloc(cur, numrecs); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, + cur->bc_ino.ip->i_ino); + + rp = xfs_btree_rec_addr(cur, 1, broot); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, rp, crp, numrecs); + + cur->bc_levels[0].bp = NULL; +} + +/* + * Move the keyptrs from a child node block to the root block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_demote_node_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int level, + int numrecs) +{ + struct xfs_btree_block *block; + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + + /* + * Adjust the root btree node size and the record count to match the + * doomed child so that we can copy the keyptrs ahead of changing the + * tree shape. + */ + block = cur->bc_ops->broot_realloc(cur, numrecs); + + xfs_btree_set_numrecs(block, numrecs); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + /* Copy keys from the doomed block. */ + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + /* Copy pointers from the doomed block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + /* Decrease tree height, adjusting the root block level to match. */ + cur->bc_levels[level - 1].bp = NULL; + be16_add_cpu(&block->bb_level, -1); + cur->bc_nlevels--; + return 0; +} + /* * Try to merge a non-leaf block back into the inode root. * @@ -3567,34 +3901,31 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; - union xfs_btree_key *kp; - union xfs_btree_key *ckp; - union xfs_btree_ptr *pp; - union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; - int index; int numrecs; int error; #ifdef DEBUG union xfs_btree_ptr ptr; #endif - int i; - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); - ASSERT(cur->bc_nlevels > 1); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); + ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || + cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; - if (level == 1) + if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) + goto out0; + + /* If we're already a leaf, jump out. */ + if (level == 0) goto out0; /* @@ -3624,40 +3955,20 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - index = numrecs - cur->bc_ops->get_maxrecs(cur, level); - if (index) { - xfs_iroot_realloc(cur->bc_ino.ip, index, - cur->bc_ino.whichfork); - block = ifp->if_broot; - } - - be16_add_cpu(&block->bb_numrecs, index); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, kp, ckp, numrecs); - - pp = xfs_btree_ptr_addr(cur, 1, block); - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - - for (i = 0; i < numrecs; i++) { - error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (level > 1) { + error = xfs_btree_demote_node_child(cur, cblock, level, + numrecs); if (error) return error; - } - - xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + } else + xfs_btree_demote_leaf_child(cur, cblock, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) return error; - cur->bc_levels[level - 1].bp = NULL; - be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); - cur->bc_nlevels--; out0: return 0; } @@ -3680,7 +3991,7 @@ xfs_btree_kill_root( * Update the root pointer, decreasing the level by 1 and then * free the old root. */ - cur->bc_ops->set_root(cur, newroot, -1); + xfs_btree_set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); if (error) @@ -3809,40 +4120,29 @@ xfs_btree_delrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, NULL, - ptr, LASTREC_DELREC); - } - - /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's - * nothing left to do. + * nothing left to do. numrecs was decremented above. */ - if (level == cur->bc_nlevels - 1) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, - cur->bc_ino.whichfork); + if (xfs_btree_at_iroot(cur, level)) { + cur->bc_ops->broot_realloc(cur, numrecs); - error = xfs_btree_kill_iroot(cur); - if (error) - goto error0; + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; - error = xfs_btree_dec_cursor(cur, level, stat); - if (error) - goto error0; - *stat = 1; - return 0; - } + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } - /* - * If this is the root level, and there's only one entry left, - * and it's NOT the leaf level, then we can get rid of this - * level. - */ + /* + * If this is the root level, and there's only one entry left, and it's + * NOT the leaf level, then we can get rid of this level. + */ + if (level == cur->bc_nlevels - 1) { if (numrecs == 1 && level > 0) { union xfs_btree_ptr *pp; /* @@ -3891,7 +4191,7 @@ xfs_btree_delrec( xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. Can't go up to next level, @@ -3931,6 +4231,7 @@ xfs_btree_delrec( */ i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3939,12 +4240,14 @@ xfs_btree_delrec( if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3992,6 +4295,7 @@ xfs_btree_delrec( if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4000,6 +4304,7 @@ xfs_btree_delrec( if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4017,6 +4322,7 @@ xfs_btree_delrec( */ i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4026,6 +4332,7 @@ xfs_btree_delrec( goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4201,8 +4508,8 @@ xfs_btree_delrec( * If we joined with the right neighbor and there's a level above * us, increment the cursor at that level. */ - else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || - (level + 1 < cur->bc_nlevels)) { + else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || + level + 1 < cur->bc_nlevels) { error = xfs_btree_increment(cur, level + 1, &i); if (error) goto error0; @@ -4270,7 +4577,7 @@ xfs_btree_delete( * If we combined blocks as part of deleting the record, delrec won't * have updated the parent high keys so we have to do that here. */ - if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { + if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { error = xfs_btree_updkeys_force(cur, 0); if (error) goto error0; @@ -4344,7 +4651,7 @@ xfs_btree_visit_block( { struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; + union xfs_btree_ptr rptr, bufptr; int error; /* do right sibling readahead */ @@ -4367,15 +4674,12 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))) - return -EFSCORRUPTED; - } else { - if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, - xfs_buf_daddr(bp))) - return -EFSCORRUPTED; + xfs_btree_buf_to_ptr(cur, bp, &bufptr); + if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4393,7 +4697,7 @@ xfs_btree_visit_blocks( struct xfs_btree_block *block = NULL; int error = 0; - cur->bc_ops->init_ptr_from_cur(cur, &lptr); + xfs_btree_init_ptr_from_cur(cur, &lptr); /* for each level */ for (level = cur->bc_nlevels - 1; level >= 0; level--) { @@ -4471,7 +4775,7 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); @@ -4489,7 +4793,7 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. */ if (!bp) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(level == cur->bc_nlevels - 1); return 0; } @@ -4523,7 +4827,7 @@ xfs_btree_change_owner( /* Verify the v5 fields of a long-format btree block. */ xfs_failaddr_t -xfs_btree_lblock_v5hdr_verify( +xfs_btree_fsblock_v5hdr_verify( struct xfs_buf *bp, uint64_t owner) { @@ -4544,7 +4848,7 @@ xfs_btree_lblock_v5hdr_verify( /* Verify a long-format btree block. */ xfs_failaddr_t -xfs_btree_lblock_verify( +xfs_btree_fsblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4553,28 +4857,60 @@ xfs_btree_lblock_verify( xfs_fsblock_t fsb; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } +/* Verify an in-memory btree block. */ +xfs_failaddr_t +xfs_btree_memblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buftarg *btp = bp->b_target; + xfs_failaddr_t fa; + xfbno_t bno; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (fa) + return fa; + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + if (fa) + return fa; + + return NULL; +} /** - * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block * * @bp: buffer containing the btree block */ xfs_failaddr_t -xfs_btree_sblock_v5hdr_verify( +xfs_btree_agblock_v5hdr_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; @@ -4587,19 +4923,19 @@ xfs_btree_sblock_v5hdr_verify( return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; - if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) return __this_address; return NULL; } /** - * xfs_btree_sblock_verify() -- verify a short-format btree block + * xfs_btree_agblock_verify() -- verify a short-format btree block * * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ xfs_failaddr_t -xfs_btree_sblock_verify( +xfs_btree_agblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4608,16 +4944,18 @@ xfs_btree_sblock_verify( xfs_agblock_t agbno; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -4815,7 +5153,7 @@ xfs_btree_overlapped_query_range( /* Load the root of the btree. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); if (error) return error; @@ -4966,7 +5304,7 @@ xfs_btree_query_range( if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, @@ -4996,7 +5334,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5006,7 +5344,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, @@ -5020,7 +5358,7 @@ xfs_btree_diff_two_ptrs( const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } @@ -5074,7 +5412,7 @@ xfs_btree_has_records_helper( key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, &rec_key, info->key_mask); if (key_contig == XBTREE_KEY_OVERLAP && - !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return -EFSCORRUPTED; if (key_contig == XBTREE_KEY_GAP) return -ECANCELED; @@ -5168,7 +5506,7 @@ xfs_btree_has_more_records( return true; /* There are more record blocks. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); @@ -5195,6 +5533,12 @@ xfs_btree_init_cur_caches(void) error = xfs_refcountbt_init_cur_cache(); if (error) goto err; + error = xfs_rtrmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rtrefcountbt_init_cur_cache(); + if (error) + goto err; return 0; err: @@ -5211,6 +5555,8 @@ xfs_btree_destroy_cur_caches(void) xfs_bmbt_destroy_cur_cache(); xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); + xfs_rtrmapbt_destroy_cur_cache(); + xfs_rtrefcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ @@ -5233,8 +5579,73 @@ xfs_btree_goto_left_edge( return error; if (stat != 0) { ASSERT(0); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } + +/* Allocate a block for an inode-rooted metadata btree. */ +int +xfs_btree_alloc_metafile_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_alloc_arg args = { + .mp = cur->bc_mp, + .tp = cur->bc_tp, + .resv = XFS_AG_RESV_METAFILE, + .minlen = 1, + .maxlen = 1, + .prod = 1, + }; + struct xfs_inode *ip = cur->bc_ino.ip; + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + + xfs_metafile_resv_alloc_space(ip, &args); + + new->l = cpu_to_be64(args.fsbno); + *stat = 1; + return 0; +} + +/* Free a block from an inode-rooted metadata btree. */ +int +xfs_btree_free_metafile_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, + 0); + if (error) + return error; + + xfs_metafile_resv_free_space(ip, tp, 1); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index d906324e25c8..355b304696e6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -55,15 +55,8 @@ union xfs_btree_rec { #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) -#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) -#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) -#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) -#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) -#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) -#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) -#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) - -uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); +struct xfs_btree_ops; +uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); /* * For logging record fields. @@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ - XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) + XFS_STATS_INC_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ - XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) + XFS_STATS_ADD_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat, val) enum xbtree_key_contig { XBTREE_KEY_GAP = 0, @@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) return XBTREE_KEY_OVERLAP; } +#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) +#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) + +enum xfs_btree_type { + XFS_BTREE_TYPE_AG, + XFS_BTREE_TYPE_INODE, + XFS_BTREE_TYPE_MEM, +}; + struct xfs_btree_ops { - /* size of the key and record structures */ - size_t key_len; - size_t rec_len; + const char *name; + + /* Type of btree - AG-rooted or inode-rooted */ + enum xfs_btree_type type; + + /* XFS_BTGEO_* flags that determine the geometry of the btree */ + unsigned int geom_flags; + + /* size of the key, pointer, and record structures */ + size_t key_len; + size_t ptr_len; + size_t rec_len; + + /* LRU refcount to set on each btree buffer created */ + unsigned int lru_refs; + + /* offset of btree stats array */ + unsigned int statoff; + + /* sick mask for health reporting (not for bmap btrees) */ + unsigned int sick_mask; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); @@ -132,12 +154,6 @@ struct xfs_btree_ops { int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); - /* update last record information */ - void (*update_lastrec)(struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, int reason); - /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); @@ -197,15 +213,27 @@ struct xfs_btree_ops { const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); -}; -/* - * Reasons for the update_lastrec method to be called. - */ -#define LASTREC_UPDATE 0 -#define LASTREC_INSREC 1 -#define LASTREC_DELREC 2 + /* + * Reallocate the space for if_broot to fit the number of records. + * Move the records and pointers in if_broot to fit the new size. When + * shrinking this will eliminate holes between the records and pointers + * created by the caller. When growing this will create holes to be + * filled in by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then if + * we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although it + * can go to zero. + */ + struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, + unsigned int new_numrecs); +}; +/* btree geometry flags */ +#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ +#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; @@ -215,39 +243,6 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree information. */ -struct xfs_btree_cur_ag { - struct xfs_perag *pag; - union { - struct xfs_buf *agbp; - struct xbtree_afakeroot *afake; /* for staging cursor */ - }; - union { - struct { - unsigned int nr_ops; /* # record updates */ - unsigned int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; - }; -}; - -/* Btree-in-inode cursor information */ -struct xfs_btree_cur_ino { - struct xfs_inode *ip; - struct xbtree_ifakeroot *ifake; /* for staging cursor */ - int allocated; - short forksize; - char whichfork; - char flags; -/* We are converting a delalloc reservation */ -#define XFS_BTCUR_BMBT_WASDEL (1 << 0) - -/* For extent swap, ignore owner check in verifier */ -#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) -}; - struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; @@ -272,21 +267,37 @@ struct xfs_btree_cur const struct xfs_btree_ops *bc_ops; struct kmem_cache *bc_cache; /* cursor cache */ unsigned int bc_flags; /* btree features - below */ - xfs_btnum_t bc_btnum; /* identifies which btree type */ union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ - int bc_statoff; /* offset of btree stats array */ + struct xfs_group *bc_group; - /* - * Short btree pointers need an agno to be able to turn the pointers - * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the - * cursor. - */ + /* per-type information */ union { - struct xfs_btree_cur_ag bc_ag; - struct xfs_btree_cur_ino bc_ino; + struct { + struct xfs_inode *ip; + short forksize; + char whichfork; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + } bc_ino; + struct { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + } bc_ag; + struct { + struct xfbtree *xfbtree; + } bc_mem; + }; + + /* per-format private data */ + union { + struct { + int allocated; + } bc_bmap; /* bmapbt */ + struct { + unsigned int nr_ops; /* # record updates */ + unsigned int shape_changes; /* # of extent splits */ + } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! */ @@ -304,18 +315,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels) return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } -/* cursor flags */ -#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ -#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ -#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ -#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ -#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* cursor state flags */ /* * The root of this btree is a fakeroot structure so that we can stage a btree * rebuild without leaving it accessible via primary metadata. The ops struct * is dynamically allocated and must be freed when the cursor is deleted. */ -#define XFS_BTREE_STAGING (1<<5) +#define XFS_BTREE_STAGING (1U << 0) + +/* We are converting a delalloc reservation (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_WASDEL (1U << 1) + +/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) + +/* Cursor is active (only for allocbt btrees) */ +#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 @@ -325,14 +340,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels) */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) -/* - * Internal long and short btree block checks. They return NULL if the - * block is ok or the address of the failed check otherwise. - */ -xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, int level, struct xfs_buf *bp); -xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, +xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); +int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int index, int level); /* * Check that block header is ok. @@ -345,24 +356,6 @@ xfs_btree_check_block( struct xfs_buf *bp); /* buffer containing block, if any */ /* - * Check that (long) pointer is ok. - */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t fsbno, /* btree block disk address */ - int level); /* btree block level */ - -/* - * Check that (short) pointer is ok. - */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t agbno, /* btree block disk address */ - int level); /* btree block level */ - -/* * Delete the btree cursor. */ void @@ -392,63 +385,14 @@ xfs_btree_offsets( int *last); /* output: last byte offset */ /* - * Get a buffer for the block, return it read in. - * Long-form addressing. - */ -int /* error */ -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops); - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. - */ -void /* error */ -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. - */ -void /* error */ -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - -/* * Initialise a new btree block header */ -void -xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner); - -void -xfs_btree_init_block_int( - struct xfs_mount *mp, - struct xfs_btree_block *buf, - xfs_daddr_t blkno, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner, - unsigned int flags); +void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, + const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, + __u64 owner); +void xfs_btree_init_block(struct xfs_mount *mp, + struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, + __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. @@ -467,10 +411,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, /* * btree block CRC helpers */ -void xfs_btree_lblock_calc_crc(struct xfs_buf *); -bool xfs_btree_lblock_verify_crc(struct xfs_buf *); -void xfs_btree_sblock_calc_crc(struct xfs_buf *); -bool xfs_btree_sblock_verify_crc(struct xfs_buf *); +void xfs_btree_fsblock_calc_crc(struct xfs_buf *); +bool xfs_btree_fsblock_verify_crc(struct xfs_buf *); +void xfs_btree_agblock_calc_crc(struct xfs_buf *); +bool xfs_btree_agblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. @@ -510,12 +454,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); -xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); +xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); -xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); -xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, + unsigned int max_recs); +xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, @@ -554,7 +500,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); @@ -690,7 +636,7 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } @@ -714,21 +660,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); +void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_btnum_t btnum, + const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) { struct xfs_btree_cur *cur; - cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL); + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || + ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); + + /* BMBT allocations can come through from non-transactional context. */ + cur = kmem_cache_zalloc(cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_btnum = btnum; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; @@ -740,4 +693,20 @@ void xfs_btree_destroy_cur_caches(void); int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); +/* Does this level of the cursor point to the inode root (and not a block)? */ +static inline bool +xfs_btree_at_iroot( + const struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1; +} + +int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, + int *stat); +int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, + struct xfs_buf *bp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c new file mode 100644 index 000000000000..f2f7b4305413 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_ag.h" +#include "xfs_buf_item.h" +#include "xfs_trace.h" +#include "xfs_rtgroup.h" + +/* Set the root of an in-memory btree. */ +void +xfbtree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + cur->bc_mem.xfbtree->root = *ptr; + cur->bc_mem.xfbtree->nlevels += inc; +} + +/* Initialize a pointer from the in-memory btree header. */ +void +xfbtree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + *ptr = cur->bc_mem.xfbtree->root; +} + +/* Duplicate an in-memory btree cursor. */ +struct xfs_btree_cur * +xfbtree_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *ncur; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops, + cur->bc_maxlevels, cur->bc_cache); + ncur->bc_flags = cur->bc_flags; + ncur->bc_nlevels = cur->bc_nlevels; + ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; + if (cur->bc_group) + ncur->bc_group = xfs_group_hold(cur->bc_group); + return ncur; +} + +/* Close the btree xfile and release all resources. */ +void +xfbtree_destroy( + struct xfbtree *xfbt) +{ + xfs_buftarg_drain(xfbt->target); +} + +/* Compute the number of bytes available for records. */ +static inline unsigned int +xfbtree_rec_bytes( + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) +{ + return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; +} + +/* Initialize an empty leaf block as the btree root. */ +STATIC int +xfbtree_init_leaf_block( + struct xfs_mount *mp, + struct xfbtree *xfbt, + const struct xfs_btree_ops *ops) +{ + struct xfs_buf *bp; + xfbno_t bno = xfbt->highest_bno++; + int error; + + error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE, + &bp); + if (error) + return error; + + trace_xfbtree_create_root_buf(xfbt, bp); + + bp->b_ops = ops->buf_ops; + xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner); + xfs_buf_relse(bp); + + xfbt->root.l = cpu_to_be64(bno); + return 0; +} + +/* + * Create an in-memory btree root that can be used with the given xmbuf. + * Callers must set xfbt->owner. + */ +int +xfbtree_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + const struct xfs_btree_ops *ops) +{ + unsigned int blocklen = xfbtree_rec_bytes(mp, ops); + unsigned int keyptr_len; + int error; + + /* Requires a long-format CRC-format btree */ + if (!xfs_has_crc(mp)) { + ASSERT(xfs_has_crc(mp)); + return -EINVAL; + } + if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) { + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN); + return -EINVAL; + } + + memset(xfbt, 0, sizeof(*xfbt)); + xfbt->target = btp; + + /* Set up min/maxrecs for this btree. */ + keyptr_len = ops->key_len + sizeof(__be64); + xfbt->maxrecs[0] = blocklen / ops->rec_len; + xfbt->maxrecs[1] = blocklen / keyptr_len; + xfbt->minrecs[0] = xfbt->maxrecs[0] / 2; + xfbt->minrecs[1] = xfbt->maxrecs[1] / 2; + xfbt->highest_bno = 0; + xfbt->nlevels = 1; + + /* Initialize the empty btree. */ + error = xfbtree_init_leaf_block(mp, xfbt, ops); + if (error) + goto err_freesp; + + trace_xfbtree_init(mp, xfbt, ops); + + return 0; + +err_freesp: + xfs_buftarg_drain(xfbt->target); + return error; +} + +/* Allocate a block to our in-memory btree. */ +int +xfbtree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfbno_t bno = xfbt->highest_bno++; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_alloc_block(xfbt, cur, bno); + + /* Fail if the block address exceeds the maximum for the buftarg. */ + if (!xfbtree_verify_bno(xfbt, bno)) { + ASSERT(xfbtree_verify_bno(xfbt, bno)); + *stat = 0; + return 0; + } + + new->l = cpu_to_be64(bno); + *stat = 1; + return 0; +} + +/* Free a block from our in-memory btree. */ +int +xfbtree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfs_daddr_t daddr = xfs_buf_daddr(bp); + xfbno_t bno = xfs_daddr_to_xfbno(daddr); + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_free_block(xfbt, cur, bno); + + if (bno + 1 == xfbt->highest_bno) + xfbt->highest_bno--; + + return 0; +} + +/* Return the minimum number of records for a btree block. */ +int +xfbtree_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->minrecs[level != 0]; +} + +/* Return the maximum number of records for a btree block. */ +int +xfbtree_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->maxrecs[level != 0]; +} + +/* If this log item is a buffer item that came from the xfbtree, return it. */ +static inline struct xfs_buf * +xfbtree_buf_match( + struct xfbtree *xfbt, + const struct xfs_log_item *lip) +{ + const struct xfs_buf_log_item *bli; + struct xfs_buf *bp; + + if (lip->li_type != XFS_LI_BUF) + return NULL; + + bli = container_of(lip, struct xfs_buf_log_item, bli_item); + bp = bli->bli_buf; + if (bp->b_target != xfbt->target) + return NULL; + + return bp; +} + +/* + * Commit changes to the incore btree immediately by writing all dirty xfbtree + * buffers to the backing xfile. This detaches all xfbtree buffers from the + * transaction, even on failure. The buffer locks are dropped between the + * delwri queue and submit, so the caller must synchronize btree access. + * + * Normally we'd let the buffers commit with the transaction and get written to + * the xfile via the log, but online repair stages ephemeral btrees in memory + * and uses the btree_staging functions to write new btrees to disk atomically. + * The in-memory btree (and its backing store) are discarded at the end of the + * repair phase, which means that xfbtree buffers cannot commit with the rest + * of a transaction. + * + * In other words, online repair only needs the transaction to collect buffer + * pointers and to avoid buffer deadlocks, not to guarantee consistency of + * updates. + */ +int +xfbtree_trans_commit( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + int error = 0; + + /* + * For each xfbtree buffer attached to the transaction, write the dirty + * buffers to the xfile and release them. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_commit_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + + /* + * If the buffer fails verification, note the failure but + * continue walking the transaction items so that we remove all + * ephemeral btree buffers. + */ + if (!error) + error = xmbuf_finalize(bp); + + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); + + return error; +} + +/* + * Cancel changes to the incore btree by detaching all the xfbtree buffers. + * Changes are not undone, so callers must not access the btree ever again. + */ +void +xfbtree_trans_cancel( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_cancel_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); +} diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h new file mode 100644 index 000000000000..1c3825786ec8 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_BTREE_MEM_H__ +#define __XFS_BTREE_MEM_H__ + +typedef uint64_t xfbno_t; + +#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE) +#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT) +#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT) + +static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno) +{ + return blkno << XFBNO_BBSHIFT; +} + +static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr) +{ + return daddr >> XFBNO_BBSHIFT; +} + +struct xfbtree { + /* buffer cache target for this in-memory btree */ + struct xfs_buftarg *target; + + /* Highest block number that has been written to. */ + xfbno_t highest_bno; + + /* Owner of this btree. */ + unsigned long long owner; + + /* Btree header */ + union xfs_btree_ptr root; + unsigned int nlevels; + + /* Minimum and maximum records per block. */ + unsigned int maxrecs[2]; + unsigned int minrecs[2]; +}; + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno) +{ + return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno)); +} + +void xfbtree_set_root(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int inc); +void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur); + +int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level); +int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level); + +int xfbtree_alloc_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr, + int *stat); +int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); + +/* Callers must set xfbt->target and xfbt->owner before calling this */ +int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt, + struct xfs_buftarg *btp, const struct xfs_btree_ops *ops); +void xfbtree_destroy(struct xfbtree *xfbt); + +int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp); +void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp); +#else +# define xfbtree_verify_bno(...) (false) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_BTREE_MEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index e276eba87cb1..5ed84f9cc877 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -39,63 +39,6 @@ */ /* - * Don't allow staging cursors to be duplicated because they're supposed to be - * kept private to a single thread. - */ -STATIC struct xfs_btree_cur * -xfs_btree_fakeroot_dup_cursor( - struct xfs_btree_cur *cur) -{ - ASSERT(0); - return NULL; -} - -/* - * Don't allow block allocation for a staging cursor, because staging cursors - * do not support regular btree modifications. - * - * Bulk loading uses a separate callback to obtain new blocks from a - * preallocated list, which prevents ENOSPC failures during loading. - */ -STATIC int -xfs_btree_fakeroot_alloc_block( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *start_bno, - union xfs_btree_ptr *new_bno, - int *stat) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* - * Don't allow block freeing for a staging cursor, because staging cursors - * do not support regular btree modifications. - */ -STATIC int -xfs_btree_fakeroot_free_block( - struct xfs_btree_cur *cur, - struct xfs_buf *bp) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* Initialize a pointer to the root block from the fakeroot. */ -STATIC void -xfs_btree_fakeroot_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - struct xbtree_afakeroot *afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - - afake = cur->bc_ag.afake; - ptr->s = cpu_to_be32(afake->af_root); -} - -/* * Bulk Loading for AG Btrees * ========================== * @@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur( * cursor into a regular btree cursor. */ -/* Update the btree root information for a per-AG fake root. */ -STATIC void -xfs_btree_afakeroot_set_root( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *ptr, - int inc) -{ - struct xbtree_afakeroot *afake = cur->bc_ag.afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - afake->af_root = be32_to_cpu(ptr->s); - afake->af_levels += inc; -} - /* * Initialize a AG-rooted btree cursor with the given AG btree fake root. - * The btree cursor's bc_ops will be overridden as needed to make the staging - * functionality work. */ void xfs_btree_stage_afakeroot( struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; - nops->set_root = xfs_btree_afakeroot_set_root; - nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ag.afake = afake; cur->bc_nlevels = afake->af_levels; - cur->bc_ops = nops; cur->bc_flags |= XFS_BTREE_STAGING; } @@ -163,17 +79,15 @@ void xfs_btree_commit_afakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, - const struct xfs_btree_ops *ops) + struct xfs_buf *agbp) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_afakeroot(cur); - kmem_free((void *)cur->bc_ops); + cur->bc_ag.afake = NULL; cur->bc_ag.agbp = agbp; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } @@ -211,29 +125,17 @@ xfs_btree_commit_afakeroot( void xfs_btree_stage_ifakeroot( struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops) + struct xbtree_ifakeroot *ifake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; - nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; - cur->bc_ops = nops; + cur->bc_ino.forksize = ifake->if_fork_size; + cur->bc_ino.whichfork = XFS_STAGING_FORK; cur->bc_flags |= XFS_BTREE_STAGING; - - if (new_ops) - *new_ops = nops; } /* @@ -246,18 +148,15 @@ void xfs_btree_commit_ifakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, - const struct xfs_btree_ops *ops) + int whichfork) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_ifakeroot(cur); - kmem_free((void *)cur->bc_ops); cur->bc_ino.ifake = NULL; cur->bc_ino.whichfork = whichfork; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } @@ -397,8 +296,7 @@ xfs_btree_bload_prep_block( struct xfs_btree_block *new_block; int ret; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); size_t new_size; @@ -406,14 +304,12 @@ xfs_btree_bload_prep_block( /* Allocate a new incore btree root block. */ new_size = bbl->iroot_size(cur, level, nr_this_block, priv); - ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ - xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, - XFS_BUF_DADDR_NULL, cur->bc_btnum, level, - nr_this_block, cur->bc_ino.ip->i_ino, - cur->bc_flags); + xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, + level, nr_this_block, cur->bc_ino.ip->i_ino); *bpp = NULL; *blockp = ifp->if_broot; @@ -678,6 +574,7 @@ xfs_btree_bload_compute_geometry( struct xfs_btree_bload *bbl, uint64_t nr_records) { + const struct xfs_btree_ops *ops = cur->bc_ops; uint64_t nr_blocks = 0; uint64_t nr_this_level; @@ -704,7 +601,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -712,7 +609,9 @@ xfs_btree_bload_compute_geometry( * * Note that bmap btrees forbid records in the root. */ - if (level != 0 && nr_this_level <= avg_per_block) { + if ((level != 0 || + (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) && + nr_this_level <= avg_per_block) { nr_blocks++; break; } @@ -763,7 +662,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; @@ -890,7 +789,7 @@ xfs_btree_bload( } /* Initialize the new root. */ - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); cur->bc_ino.ifake->if_levels = cur->bc_nlevels; cur->bc_ino.ifake->if_blocks = total_blocks - 1; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 055ea43b1e18..0c9c2ffb127a 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -22,7 +22,7 @@ struct xbtree_afakeroot { void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake); void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + struct xfs_buf *agbp); /* Fake root for an inode-rooted btree. */ struct xbtree_ifakeroot { @@ -41,10 +41,9 @@ struct xbtree_ifakeroot { /* Cursor interactions with fake roots for inode-rooted btrees. */ void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops); + struct xbtree_ifakeroot *ifake); void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, const struct xfs_btree_ops *ops); + int whichfork); /* Bulk loading of staged btrees. */ typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, @@ -76,8 +75,7 @@ struct xfs_btree_bload { /* * This function should return the size of the in-core btree root - * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree - * types. + * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees. */ xfs_btree_bload_iroot_size_fn iroot_size; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 5457188bb4de..17d9e6154f19 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -23,6 +23,7 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* * xfs_da_btree.c @@ -85,7 +86,8 @@ xfs_da_state_alloc( { struct xfs_da_state *state; - state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); + state = kmem_cache_zalloc(xfs_da_state_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); state->args = args; state->mp = args->dp->i_mount; return state; @@ -250,6 +252,51 @@ xfs_da3_node_verify( return NULL; } +xfs_failaddr_t +xfs_da3_node_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) + return __this_address; + } + + return NULL; +} + +xfs_failaddr_t +xfs_da3_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_da_blkinfo *hdr = bp->b_addr; + + if (!xfs_has_crc(mp)) + return NULL; + + switch (hdr->magic) { + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + return xfs_attr3_leaf_header_check(bp, owner); + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + return xfs_da3_node_header_check(bp, owner); + case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC): + case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC): + return xfs_dir3_leaf_header_check(bp, owner); + } + + ASSERT(0); + return NULL; +} + static void xfs_da3_node_write_verify( struct xfs_buf *bp) @@ -352,6 +399,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = { static int xfs_da3_node_set_type( struct xfs_trans *tp, + struct xfs_inode *dp, + int whichfork, struct xfs_buf *bp) { struct xfs_da_blkinfo *info = bp->b_addr; @@ -373,6 +422,7 @@ xfs_da3_node_set_type( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp, info, sizeof(*info)); xfs_trans_brelse(tp, bp); + xfs_dirattr_mark_sick(dp, whichfork); return -EFSCORRUPTED; } } @@ -391,7 +441,7 @@ xfs_da3_node_read( &xfs_da3_node_buf_ops); if (error || !*bpp || !tp) return error; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); } int @@ -408,6 +458,8 @@ xfs_da3_node_read_mapped( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno, XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0, bpp, &xfs_da3_node_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error || !*bpp) return error; @@ -418,7 +470,7 @@ xfs_da3_node_read_mapped( if (!tp) return 0; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); } /* @@ -479,7 +531,7 @@ xfs_da3_node_create( memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + hdr3->info.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; @@ -541,9 +593,8 @@ xfs_da3_split( switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: error = xfs_attr3_leaf_split(state, oldblk, newblk); - if ((error != 0) && (error != -ENOSPC)) { + if (error < 0) return error; /* GROT: attr is inconsistent */ - } if (!error) { addblk = newblk; break; @@ -565,6 +616,8 @@ xfs_da3_split( error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } + if (error == 1) + return -ENOSPC; if (error) return error; /* GROT: attr inconsistent */ addblk = newblk; @@ -631,6 +684,7 @@ xfs_da3_split( if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -644,6 +698,7 @@ xfs_da3_split( if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -1190,6 +1245,7 @@ xfs_da3_root_join( struct xfs_da3_icnode_hdr oldroothdr; int error; struct xfs_inode *dp = state->args->dp; + xfs_failaddr_t fa; trace_xfs_da_root_join(state->args); @@ -1216,6 +1272,13 @@ xfs_da3_root_join( error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* @@ -1250,6 +1313,7 @@ xfs_da3_node_toosmall( struct xfs_da_blkinfo *info; xfs_dablk_t blkno; struct xfs_buf *bp; + xfs_failaddr_t fa; struct xfs_da3_icnode_hdr nodehdr; int count; int forward; @@ -1324,6 +1388,13 @@ xfs_da3_node_toosmall( state->args->whichfork); if (error) return error; + fa = xfs_da3_node_header_check(bp, state->args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(state->args->trans, bp); + xfs_da_mark_sick(state->args); + return -EFSCORRUPTED; + } node = bp->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); @@ -1582,6 +1653,7 @@ xfs_da3_node_lookup_int( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_args *args; + xfs_failaddr_t fa; xfs_dablk_t blkno; xfs_dahash_t hashval; xfs_dahash_t btreehashval; @@ -1620,6 +1692,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_ATTR_LEAF_MAGIC || magic == XFS_ATTR3_LEAF_MAGIC) { + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; @@ -1627,6 +1705,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_DIR2_LEAFN_MAGIC || magic == XFS_DIR3_LEAFN_MAGIC) { + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; blk->hashval = xfs_dir2_leaf_lasthash(args->dp, blk->bp, NULL); @@ -1635,6 +1719,14 @@ xfs_da3_node_lookup_int( if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1650,6 +1742,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1658,6 +1751,7 @@ xfs_da3_node_lookup_int( expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } else expected_level--; @@ -1709,12 +1803,16 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. */ - if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) + if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } } - if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) + if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * A leaf block that ends in the hashval that we are interested in @@ -1732,6 +1830,7 @@ xfs_da3_node_lookup_int( args->blkno = blk->blkno; } else { ASSERT(0); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (((retval == -ENOENT) || (retval == -ENOATTR)) && @@ -1803,6 +1902,7 @@ xfs_da3_blk_link( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int before = 0; int error; struct xfs_inode *dp = state->args->dp; @@ -1846,6 +1946,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1867,6 +1974,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1896,6 +2010,7 @@ xfs_da3_blk_unlink( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int error; /* @@ -1926,6 +2041,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1943,6 +2065,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1979,6 +2108,7 @@ xfs_da3_path_shift( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_buf *bp; + xfs_failaddr_t fa; xfs_dablk_t blkno = 0; int level; int error; @@ -2057,6 +2187,12 @@ xfs_da3_path_shift( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); @@ -2070,6 +2206,12 @@ xfs_da3_path_shift( break; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2077,6 +2219,12 @@ xfs_da3_path_shift( break; case XFS_DIR2_LEAFN_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2150,8 +2298,8 @@ xfs_da_grow_inode_int( struct xfs_inode *dp = args->dp; int w = args->whichfork; xfs_rfsblock_t nblks = dp->i_nblocks; - struct xfs_bmbt_irec map, *mapp; - int nmap, error, got, i, mapi; + struct xfs_bmbt_irec map, *mapp = ↦ + int nmap, error, got, i, mapi = 1; /* * Find a spot in the file space to put the new block. @@ -2167,14 +2315,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->total, &map, &nmap); - if (error) - return error; - - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = ↦ - mapi = 1; - } else if (nmap == 0 && count > 1) { + if (error == -ENOSPC && count > 1) { xfs_fileoff_t b; int c; @@ -2182,7 +2323,8 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. */ - mapp = kmem_alloc(sizeof(*mapp) * count, 0); + mapp = kmalloc(sizeof(*mapp) * count, + GFP_KERNEL | __GFP_NOFAIL); for (b = *bno, mapi = 0; b < *bno + count; ) { c = (int)(*bno + count - b); nmap = min(XFS_BMAP_MAX_NMAP, c); @@ -2191,16 +2333,13 @@ xfs_da_grow_inode_int( args->total, &mapp[mapi], &nmap); if (error) goto out_free_map; - if (nmap < 1) - break; mapi += nmap; b = mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount; } - } else { - mapi = 0; - mapp = NULL; } + if (error) + goto out_free_map; /* * Count the blocks we got, make sure it matches the total. @@ -2219,7 +2358,7 @@ xfs_da_grow_inode_int( out_free_map: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2272,6 +2411,7 @@ xfs_da3_swap_lastblock( struct xfs_buf *last_buf; struct xfs_buf *sib_buf; struct xfs_buf *par_buf; + xfs_failaddr_t fa; xfs_dahash_t dead_hash; xfs_fileoff_t lastoff; xfs_dablk_t dead_blkno; @@ -2297,8 +2437,10 @@ xfs_da3_swap_lastblock( error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; - if (XFS_IS_CORRUPT(mp, lastoff == 0)) + if (XFS_IS_CORRUPT(mp, lastoff == 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * Read the last block in the btree space. */ @@ -2306,6 +2448,14 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; + fa = xfs_da3_header_check(last_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(last_buf, fa); + xfs_trans_brelse(tp, last_buf); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + /* * Copy the last block into the dead buffer and log it. */ @@ -2344,10 +2494,18 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2364,10 +2522,18 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2386,10 +2552,18 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, level >= 0 && level != par_hdr.level + 1)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2401,6 +2575,7 @@ xfs_da3_swap_lastblock( entno++) continue; if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2426,15 +2601,24 @@ xfs_da3_swap_lastblock( xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (XFS_IS_CORRUPT(mp, par_blkno == 0)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2518,7 +2702,8 @@ xfs_dabuf_map( int error = 0, nirecs, i; if (nfsb > 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS); + irecs = kzalloc(sizeof(irec) * nfsb, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); nirecs = nfsb; error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, @@ -2531,7 +2716,8 @@ xfs_dabuf_map( * larger one that needs to be free by the caller. */ if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); + map = kzalloc(nirecs * sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!map) { error = -ENOMEM; goto out_free_irecs; @@ -2557,12 +2743,13 @@ xfs_dabuf_map( *nmaps = nirecs; out_free_irecs: if (irecs != &irec) - kmem_free(irecs); + kfree(irecs); return error; invalid_mapping: /* Caller ok with no mapping. */ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) { + xfs_dirattr_mark_sick(dp, whichfork); error = -EFSCORRUPTED; if (xfs_error_level >= XFS_ERRLEVEL_LOW) { xfs_alert(mp, "%s: bno %u inode %llu", @@ -2613,7 +2800,7 @@ xfs_da_get_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2644,6 +2831,8 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0, &bp, ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error) goto out_free; @@ -2654,7 +2843,7 @@ xfs_da_read_buf( *bpp = bp; out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2685,7 +2874,7 @@ xfs_da_reada_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 706baf36e175..354d5d65043e 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -54,17 +54,24 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const uint8_t *name; /* string (maybe not NULL terminated) */ - int namelen; /* length of string (maybe no NULL) */ - uint8_t filetype; /* filetype of inode for directories */ + const uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *new_name; /* new attr name */ void *value; /* set of bytes (maybe contain NULLs) */ - int valuelen; /* length of value */ - unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ - unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ - xfs_dahash_t hashval; /* hash value of name */ - xfs_ino_t inumber; /* input/output inode number */ + void *new_value; /* new xattr value (may contain NULLs) */ struct xfs_inode *dp; /* directory inode to manipulate */ struct xfs_trans *trans; /* current trans (changes over time) */ + + xfs_ino_t inumber; /* input/output inode number */ + xfs_ino_t owner; /* inode that owns the dir/attr data */ + + int valuelen; /* length of value */ + int new_valuelen; /* length of new_value */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t op_flags; /* operation flags */ + uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + short namelen; /* length of string (maybe no NULL) */ + short new_namelen; /* length of new attr name */ + xfs_dahash_t hashval; /* hash value of name */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ xfs_dablk_t blkno; /* blkno of attr leaf of interest */ @@ -77,7 +84,6 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -89,10 +95,8 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ #define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ -#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ -#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ -#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ -#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ +#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -100,8 +104,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" }, \ - { XFS_DA_OP_REMOVE, "REMOVE" }, \ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ { XFS_DA_OP_LOGGED, "LOGGED" } @@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); +xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner); extern struct kmem_cache *xfs_da_state_cache; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 24f9d1461f9a..86de99e2f757 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -159,6 +159,17 @@ struct xfs_da3_intnode { #define XFS_DIR3_FT_MAX 9 +#define XFS_DIR3_FTYPE_STR \ + { XFS_DIR3_FT_UNKNOWN, "unknown" }, \ + { XFS_DIR3_FT_REG_FILE, "file" }, \ + { XFS_DIR3_FT_DIR, "directory" }, \ + { XFS_DIR3_FT_CHRDEV, "char" }, \ + { XFS_DIR3_FT_BLKDEV, "block" }, \ + { XFS_DIR3_FT_FIFO, "fifo" }, \ + { XFS_DIR3_FT_SOCK, "sock" }, \ + { XFS_DIR3_FT_SYMLINK, "symlink" }, \ + { XFS_DIR3_FT_WHT, "whiteout" } + /* * Byte offset in data block and shortform entry. */ @@ -703,12 +714,30 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ #define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT) #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) + +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT) + +/* Private attr namespaces not exposed to userspace */ +#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT) + +#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ + XFS_ATTR_LOCAL | \ + XFS_ATTR_INCOMPLETE) + +#define XFS_ATTR_NAMESPACE_STR \ + { XFS_ATTR_LOCAL, "local" }, \ + { XFS_ATTR_ROOT, "root" }, \ + { XFS_ATTR_SECURE, "secure" }, \ + { XFS_ATTR_PARENT, "parent" } /* * Alignment for namelist and valuelist entries (since they are mixed @@ -851,9 +880,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) -#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_has_crc((mp)) ? \ - sizeof(struct xfs_attr3_rmt_hdr) : 0)) +unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp); /* Number of bytes in a directory block. */ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) @@ -864,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, struct xfs_da3_blkinfo *hdr3); +/* + * Parent pointer attribute format definition + * + * The xattr name contains the dirent name. + * The xattr value encodes the parent inode number and generation to ease + * opening parents by handle. + * The xattr hashval is xfs_dir2_namehash() ^ p_ino + */ +struct xfs_parent_rec { + __be64 p_ino; + __be32 p_gen; +} __packed; + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 66a17910d021..5b377cbbb1f7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -12,12 +12,14 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "xfs_bmap.h" @@ -26,7 +28,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" -#include "xfs_trans_priv.h" +#include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -555,7 +557,7 @@ xfs_defer_relog( * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { - threshold_lsn = xlog_grant_push_threshold(log, 0); + threshold_lsn = xfs_ail_get_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } @@ -819,16 +821,16 @@ xfs_defer_can_append( /* Create a new pending item at the end of the transaction list. */ static inline struct xfs_defer_pending * xfs_defer_alloc( - struct xfs_trans *tp, + struct list_head *dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); + list_add_tail(&dfp->dfp_list, dfops); return dfp; } @@ -844,9 +846,15 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!ops->finish_item) { + ASSERT(ops->finish_item != NULL); + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + return NULL; + } + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) - dfp = xfs_defer_alloc(tp, ops); + dfp = xfs_defer_alloc(&tp->t_dfops, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); @@ -870,7 +878,7 @@ xfs_defer_add_barrier( if (dfp) return; - xfs_defer_alloc(tp, &xfs_barrier_defer_type); + xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } @@ -885,14 +893,9 @@ xfs_defer_start_recovery( struct list_head *r_dfops, const struct xfs_defer_op_type *ops) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_ops = ops; dfp->dfp_intent = lip; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, r_dfops); } /* @@ -979,7 +982,7 @@ xfs_defer_ops_capture( return ERR_PTR(error); /* Create an object to capture the defer ops. */ - dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); @@ -1011,7 +1014,7 @@ xfs_defer_ops_capture( * transaction. */ for (i = 0; i < dfc->dfc_held.dr_inos; i++) { - ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } @@ -1038,7 +1041,7 @@ xfs_defer_ops_capture_abort( for (i = 0; i < dfc->dfc_held.dr_inos; i++) xfs_irele(dfc->dfc_held.dr_ip[i]); - kmem_free(dfc); + kfree(dfc); } /* @@ -1096,7 +1099,11 @@ xfs_defer_ops_continue( ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. */ - if (dfc->dfc_held.dr_inos == 2) + if (dfc->dfc_held.dr_inos > 2) { + xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); + xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, + XFS_ILOCK_EXCL); + } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) @@ -1114,7 +1121,7 @@ xfs_defer_ops_continue( list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; - kmem_free(dfc); + kfree(dfc); } /* Release the resources captured and continued during recovery. */ @@ -1181,6 +1188,10 @@ xfs_defer_init_item_caches(void) error = xfs_attr_intent_init_cache(); if (error) goto err; + error = xfs_exchmaps_intent_init_cache(); + if (error) + goto err; + return 0; err: xfs_defer_destroy_item_caches(); @@ -1191,6 +1202,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 18a9fb92dde8..9effd95ddcd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -68,16 +68,25 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; - +extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; /* * Deferred operation item relogging limits. */ -#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +/* + * Rename w/ parent pointers can require up to 5 inodes with deferred ops to + * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip. + * These inodes are locked in sorted order by their inode numbers + */ +#define XFS_DEFER_OPS_NR_INODES 5 #define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ /* Resources that must be held across a transaction roll. */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a76673281514..1775abcfa04d 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -18,6 +18,12 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_health.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_parent.h" +#include "xfs_ag.h" +#include "xfs_ialloc.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -25,6 +31,12 @@ const struct xfs_name xfs_name_dotdot = { .type = XFS_DIR3_FT_DIR, }; +const struct xfs_name xfs_name_dot = { + .name = (const unsigned char *)".", + .len = 1, + .type = XFS_DIR3_FT_DIR, +}; + /* * Convert inode mode to directory entry filetype */ @@ -104,13 +116,13 @@ xfs_da_mount( ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); - mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); - mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); + mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); return -ENOMEM; } @@ -178,14 +190,14 @@ void xfs_da_unmount( struct xfs_mount *mp) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); } /* * Return 1 if directory contains only "." and "..". */ -int +static bool xfs_dir_isempty( xfs_inode_t *dp) { @@ -193,9 +205,9 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ - return 1; + return true; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) - return 0; + return false; sfp = dp->i_df.if_data; return !sfp->count; } @@ -236,18 +248,75 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; + args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); - kmem_free(args); + kfree(args); return error; } +enum xfs_dir2_fmt +xfs_dir2_format( + struct xfs_da_args *args, + int *error) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t eof; + + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + *error = 0; + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return XFS_DIR2_FMT_SF; + + *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); + if (*error) + return XFS_DIR2_FMT_ERROR; + + if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { + if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { + xfs_da_mark_sick(args); + *error = -EFSCORRUPTED; + return XFS_DIR2_FMT_ERROR; + } + return XFS_DIR2_FMT_BLOCK; + } + if (eof == geo->leafblk + geo->fsbcount) + return XFS_DIR2_FMT_LEAF; + return XFS_DIR2_FMT_NODE; +} + +int +xfs_dir_createname_args( + struct xfs_da_args *args) +{ + int error; + + if (!args->inumber) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_addname(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_addname(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_addname(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_addname(args); + default: + return error; + } +} + /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. @@ -262,7 +331,6 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -273,7 +341,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -288,32 +356,10 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (!inum) - args->op_flags |= XFS_DA_OP_JUSTCHECK; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } + args->owner = dp->i_ino; - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); - -out_free: - kmem_free(args); + rval = xfs_dir_createname_args(args); + kfree(args); return rval; } @@ -333,15 +379,43 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + args->value = kmemdup(name, len, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; - memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } +int +xfs_dir_lookup_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + error = xfs_dir2_sf_lookup(args); + break; + case XFS_DIR2_FMT_BLOCK: + error = xfs_dir2_block_lookup(args); + break; + case XFS_DIR2_FMT_LEAF: + error = xfs_dir2_leaf_lookup(args); + break; + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_node_lookup(args); + break; + default: + break; + } + + if (error != -EEXIST) + return error; + return 0; +} + /* * Lookup a name in a directory, give back the inode number. * If ci_name is not NULL, returns the actual name in ci_name if it differs @@ -358,21 +432,13 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); - /* - * We need to use KM_NOFS here so that lockdep will not throw false - * positive deadlock warnings on a non-transactional lookup path. It is - * safe to recurse into inode recalim in that case, but lockdep can't - * easily be taught about it. Hence KM_NOFS avoids having to add more - * lockdep Doing this avoids having to add a bunch of lockdep class - * annotations into the reclaim path for the ilock. - */ - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; @@ -382,34 +448,12 @@ xfs_dir_lookup( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; + args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_lookup(args); - else - rval = xfs_dir2_node_lookup(args); - -out_check_rval: - if (rval == -EEXIST) - rval = 0; + rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { @@ -417,12 +461,31 @@ out_check_rval: ci_name->len = args->valuelen; } } -out_free: xfs_iunlock(dp, lock_mode); - kmem_free(args); + kfree(args); return rval; } +int +xfs_dir_removename_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_removename(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_removename(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_removename(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_removename(args); + default: + return error; + } +} + /* * Remove an entry from a directory. */ @@ -430,18 +493,17 @@ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -455,30 +517,30 @@ xfs_dir_removename( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; + args->owner = dp->i_ino; + rval = xfs_dir_removename_args(args); + kfree(args); + return rval; +} - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_removename(args); - goto out_free; - } +int +xfs_dir_replace_args( + struct xfs_da_args *args) +{ + int error; - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_removename(args); - goto out_free; + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_replace(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_replace(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_replace(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_replace(args); + default: + return error; } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_removename(args); - else - rval = xfs_dir2_node_removename(args); -out_free: - kmem_free(args); - return rval; } /* @@ -494,7 +556,6 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -502,7 +563,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -516,29 +577,9 @@ xfs_dir_replace( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_replace(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_replace(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_replace(args); - else - rval = xfs_dir2_node_replace(args); -out_free: - kmem_free(args); + args->owner = dp->i_ino; + rval = xfs_dir_replace_args(args); + kfree(args); return rval; } @@ -547,9 +588,9 @@ out_free: */ int xfs_dir_canenter( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name) /* name of entry to add */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } @@ -606,55 +647,6 @@ xfs_dir2_grow_inode( } /* - * See if the directory is a single-block form directory. - */ -int -xfs_dir2_isblock( - struct xfs_da_args *args, - bool *isblock) -{ - struct xfs_mount *mp = args->dp->i_mount; - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isblock = false; - if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) - return 0; - - *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) - return -EFSCORRUPTED; - return 0; -} - -/* - * See if the directory is a single-leaf form directory. - */ -int -xfs_dir2_isleaf( - struct xfs_da_args *args, - bool *isleaf) -{ - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isleaf = false; - if (eof != args->geo->leafblk + args->geo->fsbcount) - return 0; - - *isleaf = true; - return 0; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. @@ -768,3 +760,653 @@ xfs_dir2_compname( return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } + +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. */ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. */ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + +/* + * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip + * into @dp under the given @name. If @ip is a directory, it will be + * initialized. Both inodes must have the ILOCK held and the transaction must + * have sufficient blocks reserved. + */ +int +xfs_dir_create_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOSPC); + return error; + } + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + error = xfs_dir_init(tp, ip, dp); + if (error) + return error; + + xfs_bumplink(tp, dp); + } + + /* + * If we have parent pointers, we need to add the attribute containing + * the parent information now. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, 1, name); + return 0; +} + +/* + * Given a directory @dp, an existing non-directory inode @ip, and a @name, + * link @ip into @dp under the given @name. Both inodes must have the ILOCK + * held. + */ +int +xfs_dir_add_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + struct xfs_mount *mp = tp->t_mountp; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + ASSERT(!S_ISDIR(VFS_I(ip)->i_mode)); + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + return error; + } + + /* + * Handle initial link state of O_TMPFILE inode + */ + if (VFS_I(ip)->i_nlink == 0) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) + return error; + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + xfs_bumplink(tp, ip); + + /* + * If we have parent pointers, we now need to add the parent record to + * the attribute fork of the inode. If this is the initial parent + * attribute, we need to create it correctly, otherwise we can just add + * the parent to the inode. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, 1, name); + return 0; +} + +/* + * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip) + * entry from the directory. Both inodes must have the ILOCK held. + */ +int +xfs_dir_remove_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (S_ISDIR(VFS_I(ip)->i_mode)) { + ASSERT(VFS_I(ip)->i_nlink >= 2); + if (VFS_I(ip)->i_nlink != 2) + return -ENOTEMPTY; + if (!xfs_dir_isempty(ip)) + return -ENOTEMPTY; + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + return error; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + /* + * Point the unlinked child directory's ".." entry to the root + * directory to eliminate back-references to inodes that may + * get freed before the child directory is closed. If the fs + * gets shrunk, this can lead to dirent inode validation errors. + */ + if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) + return error; + } + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOENT); + return error; + } + + /* Remove parent pointer. */ + if (du->ppargs) { + error = xfs_parent_removename(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, -1, name); + return 0; +} + +/* + * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2, + * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed. + * @ip1 and @ip2 need not be of the same type. + * + * All inodes must have the ILOCK held, and both entries must already exist. + */ +int +xfs_dir_exchange_children( + struct xfs_trans *tp, + struct xfs_dir_update *du1, + struct xfs_dir_update *du2, + unsigned int spaceres) +{ + struct xfs_inode *dp1 = du1->dp; + const struct xfs_name *name1 = du1->name; + struct xfs_inode *ip1 = du1->ip; + struct xfs_inode *dp2 = du2->dp; + const struct xfs_name *name2 = du2->name; + struct xfs_inode *ip2 = du2->ip; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + int error; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); + if (error) + return error; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); + if (error) + return error; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, spaceres); + if (error) + return error; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + return error; + xfs_bumplink(tp, dp1); + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, spaceres); + if (error) + return error; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + return error; + xfs_bumplink(tp, dp2); + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* Schedule parent pointer replacements */ + if (du1->ppargs) { + error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1, + dp2, name2, ip1); + if (error) + return error; + } + + if (du2->ppargs) { + error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2, + dp1, name1, ip2); + if (error) + return error; + } + + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. + */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); + return 0; +} + +/* + * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry + * @target_name in directory @target_dp point to @src_ip and remove the + * original entry, cleaning up everything left behind. + * + * Cleanup involves dropping a link count on @target_ip, and either removing + * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry + * with (@src_name, @wip) if a whiteout inode @wip is supplied. + * + * All inodes must have the ILOCK held. We assume that if @src_ip is a + * directory then its '..' doesn't already point to @target_dp, and that @wip + * is a freshly allocated whiteout. + */ +int +xfs_dir_rename_children( + struct xfs_trans *tp, + struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, + unsigned int spaceres, + struct xfs_dir_update *du_wip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *src_dp = du_src->dp; + const struct xfs_name *src_name = du_src->name; + struct xfs_inode *src_ip = du_src->ip; + struct xfs_inode *target_dp = du_tgt->dp; + const struct xfs_name *target_name = du_tgt->name; + struct xfs_inode *target_ip = du_tgt->ip; + bool new_parent = (src_dp != target_dp); + bool src_is_directory; + int error; + + src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); + + /* + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + return error; + } + } else { + /* + * If target exists and it's a directory, check that whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) + return -EEXIST; + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. + */ + if (du_wip->ip) { + struct xfs_perag *pag; + + ASSERT(VFS_I(du_wip->ip)->i_nlink == 0); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, du_wip->ip); + xfs_perag_put(pag); + if (error) + return error; + + xfs_bumplink(tp, du_wip->ip); + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + xfs_bumplink(tp, target_dp); + } + } else { /* target_ip != NULL */ + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, spaceres); + ASSERT(error != -EEXIST); + if (error) + return error; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + return error; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (du_wip->ip) + error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino, + spaceres); + else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* Schedule parent pointer updates. */ + if (du_wip->ppargs) { + error = xfs_parent_addname(tp, du_wip->ppargs, src_dp, + src_name, du_wip->ip); + if (error) + return error; + } + + if (du_src->ppargs) { + error = xfs_parent_replacename(tp, du_src->ppargs, src_dp, + src_name, target_dp, target_name, src_ip); + if (error) + return error; + } + + if (du_tgt->ppargs) { + error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp, + target_name, target_ip); + if (error) + return error; + } + + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. + */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (du_wip->ip) + xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 19af22a16c41..a6594a5a941d 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -22,6 +22,29 @@ struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; extern const struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dot; + +static inline bool +xfs_dir2_samename( + const struct xfs_name *n1, + const struct xfs_name *n2) +{ + if (n1 == n2) + return true; + if (n1->len != n2->len) + return false; + return !memcmp(n1->name, n2->name, n1->len); +} + +enum xfs_dir2_fmt { + XFS_DIR2_FMT_SF, + XFS_DIR2_FMT_BLOCK, + XFS_DIR2_FMT_LEAF, + XFS_DIR2_FMT_NODE, + XFS_DIR2_FMT_ERROR, +}; + +enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error); /* * Convert inode mode to directory entry filetype @@ -35,7 +58,6 @@ extern void xfs_dir_startup(void); extern int xfs_da_mount(struct xfs_mount *mp); extern void xfs_da_unmount(struct xfs_mount *mp); -extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, @@ -45,13 +67,18 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t ino, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name); + const struct xfs_name *name); + +int xfs_dir_lookup_args(struct xfs_da_args *args); +int xfs_dir_createname_args(struct xfs_da_args *args); +int xfs_dir_removename_args(struct xfs_da_args *args); +int xfs_dir_replace_args(struct xfs_da_args *args); /* * Direct call from the bmap code, bypassing the generic directory layer. @@ -61,8 +88,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); @@ -88,6 +113,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; @@ -279,4 +308,51 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) return c; } +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + +struct xfs_parent_args; + +struct xfs_dir_update { + struct xfs_inode *dp; + const struct xfs_name *name; + struct xfs_inode *ip; + struct xfs_parent_args *ppargs; +}; + +int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); +int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); +int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); + +int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1, + struct xfs_dir_update *du2, unsigned int spaceres); +int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, unsigned int spaceres, + struct xfs_dir_update *du_wip); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 3c256d4cc40b..0f93ed1a4a74 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Local function prototypes. @@ -114,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_block_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->owner) != dp->i_ino) + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } @@ -135,6 +139,7 @@ int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; @@ -147,11 +152,12 @@ xfs_dir3_block_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_block_header_check(dp, *bpp); + fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -161,12 +167,13 @@ xfs_dir3_block_read( static void xfs_dir3_block_init( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); @@ -175,7 +182,7 @@ xfs_dir3_block_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; @@ -380,7 +387,7 @@ xfs_dir2_block_addname( tp = args->trans; /* Read the (one and only) directory block into bp. */ - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -695,7 +702,7 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -979,7 +986,8 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1007,7 +1015,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - xfs_dir3_block_init(mp, tp, dbp, dp); + xfs_dir3_block_init(args, dbp); needlog = 1; needscan = 0; @@ -1108,7 +1116,7 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, 0); + sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); @@ -1127,7 +1135,7 @@ xfs_dir2_sf_to_block( error = xfs_dir3_data_init(args, blkno, &bp); if (error) goto out_free; - xfs_dir3_block_init(mp, tp, bp, dp); + xfs_dir3_block_init(args, bp); hdr = bp->b_addr; /* @@ -1167,7 +1175,7 @@ xfs_dir2_sf_to_block( * Create entry for . */ dep = bp->b_addr + offset; - dep->inumber = cpu_to_be64(dp->i_ino); + dep->inumber = cpu_to_be64(args->owner); dep->namelen = 1; dep->name[0] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); @@ -1253,7 +1261,7 @@ xfs_dir2_sf_to_block( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(sfp); + kfree(sfp); /* * Sort the leaf entries by hash value. */ @@ -1268,6 +1276,6 @@ xfs_dir2_sf_to_block( xfs_dir3_data_check(dp, bp); return 0; out_free: - kmem_free(sfp); + kfree(sfp); return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index dbcf58979a59..a16b05c43e2e 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" static xfs_failaddr_t xfs_dir2_data_freefind_verify( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, @@ -177,6 +178,14 @@ __xfs_dir3_data_check( while (offset < end) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + unsigned int reclen; + + /* + * Are the remaining bytes large enough to hold an + * unused entry? + */ + if (offset > end - xfs_dir2_data_unusedsize(1)) + return __this_address; /* * If it's unused, look for the space in the bestfree table. @@ -186,9 +195,13 @@ __xfs_dir3_data_check( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { xfs_failaddr_t fa; + reclen = xfs_dir2_data_unusedsize( + be16_to_cpu(dup->length)); if (lastfree != 0) return __this_address; - if (offset + be16_to_cpu(dup->length) > end) + if (be16_to_cpu(dup->length) != reclen) + return __this_address; + if (offset + reclen > end) return __this_address; if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != offset) @@ -206,10 +219,18 @@ __xfs_dir3_data_check( be16_to_cpu(bf[2].length)) return __this_address; } - offset += be16_to_cpu(dup->length); + offset += reclen; lastfree = 1; continue; } + + /* + * This is not an unused entry. Are the remaining bytes + * large enough for a dirent with a single-byte name? + */ + if (offset > end - xfs_dir2_data_entsize(mp, 1)) + return __this_address; + /* * It's a real entry. Validate the fields. * If this is a block directory then make sure it's @@ -218,9 +239,10 @@ __xfs_dir3_data_check( */ if (dep->namelen == 0) return __this_address; - if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) + reclen = xfs_dir2_data_entsize(mp, dep->namelen); + if (offset + reclen > end) return __this_address; - if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) + if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) return __this_address; if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset) return __this_address; @@ -244,7 +266,7 @@ __xfs_dir3_data_check( if (i >= be32_to_cpu(btp->count)) return __this_address; } - offset += xfs_dir2_data_entsize(mp, dep->namelen); + offset += reclen; } /* * Need to have seen all the entries and all the bestfree slots. @@ -394,17 +416,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_data_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } @@ -415,6 +440,7 @@ int xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp) @@ -428,11 +454,12 @@ xfs_dir3_data_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_data_header_check(dp, *bpp); + fa = xfs_dir3_data_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -723,7 +750,7 @@ xfs_dir3_data_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else @@ -1198,6 +1225,7 @@ xfs_dir2_data_use_free( corrupt: xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, hdr, sizeof(*hdr), __FILE__, __LINE__, fa); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index cb9e950a911d..71c2f22a3f6e 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -19,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_health.h" /* * Local function declarations. @@ -207,6 +208,29 @@ xfs_dir3_leaf_verify( return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } +xfs_failaddr_t +xfs_dir3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_leaf *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) && + hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_dir3_leaf_read_verify( struct xfs_buf *bp) @@ -270,32 +294,60 @@ int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); - return err; + return 0; } int xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); - return err; + return 0; } /* @@ -303,12 +355,12 @@ xfs_dir3_leafn_read( */ static void xfs_dir3_leaf_init( - struct xfs_mount *mp, - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, - xfs_ino_t owner, uint16_t type) { + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans *tp = args->trans; struct xfs_dir2_leaf *leaf = bp->b_addr; ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); @@ -322,7 +374,7 @@ xfs_dir3_leaf_init( ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - leaf3->info.owner = cpu_to_be64(owner); + leaf3->info.owner = cpu_to_be64(args->owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); @@ -355,7 +407,6 @@ xfs_dir3_leaf_get_buf( { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; - struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; @@ -368,7 +419,7 @@ xfs_dir3_leaf_get_buf( if (error) return error; - xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_init(args, bp, magic); xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) xfs_dir3_leaf_log_tail(args, bp); @@ -646,7 +697,8 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -833,9 +885,9 @@ xfs_dir2_leaf_addname( * Already had space in some data block. * Just read that one in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, use_block), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, use_block), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1237,7 +1289,8 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -1275,9 +1328,9 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, newdb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, newdb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1317,9 +1370,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, cidb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, cidb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1393,8 +1446,10 @@ xfs_dir2_leaf_removename( bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } + /* * Mark the former data entry unused. */ @@ -1611,7 +1666,8 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; @@ -1750,7 +1806,8 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk, + &fbp); if (error) return error; xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 7a03aeb9f4c9..fe8d4fa13128 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Function declarations. @@ -174,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { /* Everything ok in the free block header? */ static xfs_failaddr_t xfs_dir3_free_header_check( - struct xfs_inode *dp, - xfs_dablk_t fbno, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner, + xfs_dablk_t fbno) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; @@ -194,7 +195,7 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -213,6 +214,7 @@ static int __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, unsigned int flags, struct xfs_buf **bpp) @@ -226,11 +228,12 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + fa = xfs_dir3_free_header_check(*bpp, owner, fbno); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -297,20 +300,23 @@ int xfs_dir2_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp); } static int xfs_dir2_free_try_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK, + bpp); } static int @@ -347,7 +353,7 @@ xfs_dir3_free_get_buf( hdr.magic = XFS_DIR3_FREE_MAGIC; hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + hdr3->hdr.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; @@ -443,6 +449,7 @@ xfs_dir2_leaf_to_node( if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_disk_size / args->geo->blksize) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -517,6 +524,7 @@ xfs_dir2_leafn_add( */ if (index < 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -713,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname( if (curbp) xfs_trans_brelse(tp, curbp); - error = xfs_dir2_free_read(tp, dp, + error = xfs_dir2_free_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newfdb), &curbp); @@ -736,6 +744,7 @@ xfs_dir2_leafn_lookup_for_addname( cpu_to_be16(NULLDATAOFF))) { if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } curfdb = newfdb; @@ -804,6 +813,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -857,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir3_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newdb), 0, &curbp); @@ -1350,8 +1360,8 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(geo, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), - &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; free = fbp->b_addr; @@ -1556,7 +1566,8 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, + state->args->owner, blkno, &bp); if (error) return error; @@ -1709,7 +1720,7 @@ xfs_dir2_node_add_datablk( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) return error; @@ -1739,6 +1750,7 @@ xfs_dir2_node_add_datablk( } else { xfs_alert(mp, " ... fblk is NULL"); } + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1855,7 +1867,7 @@ xfs_dir2_node_find_freeblk( * so this might not succeed. This should be really rare, so * there's no reason to avoid it. */ - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) @@ -1941,9 +1953,8 @@ xfs_dir2_node_addname_int( &freehdr, &findex); } else { /* Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, dbno), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp); } if (error) return error; @@ -2295,7 +2306,7 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp); if (error) return error; /* diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 1db2e60ba827..10041350274a 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, /* xfs_dir2_block.c */ -extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_buf **bpp); +int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); @@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, + struct xfs_buf **bpp); int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, unsigned int flags); @@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from); int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -154,8 +155,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); -extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -189,6 +190,13 @@ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); static inline unsigned int +xfs_dir2_data_unusedsize( + unsigned int len) +{ + return round_up(len, XFS_DIR2_DATA_ALIGN); +} + +static inline unsigned int xfs_dir2_data_entsize( struct xfs_mount *mp, unsigned int namelen) diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e1f83fc7b6ad..17a20384c8b7 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -276,7 +276,7 @@ xfs_dir2_block_to_sf( * format the data into. Once we have formatted the data, we can free * the block and copy the formatted data into the inode literal area. */ - sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0); + sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); /* @@ -350,7 +350,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(sfp); + kfree(sfp); return error; } @@ -524,7 +524,7 @@ xfs_dir2_sf_addname_hard( * Copy the old directory to the stack buffer. */ old_isize = (int)dp->i_disk_size; - buf = kmem_alloc(old_isize, 0); + buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, dp->i_df.if_data, old_isize); /* @@ -576,7 +576,7 @@ xfs_dir2_sf_addname_hard( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf); + kfree(buf); dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -1151,7 +1151,7 @@ xfs_dir2_sf_toino4( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1190,7 +1190,7 @@ xfs_dir2_sf_toino4( /* * Clean up the inode. */ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1223,7 +1223,7 @@ xfs_dir2_sf_toino8( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1262,7 +1262,7 @@ xfs_dir2_sf_toino8( /* * Clean up the inode. */ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 15a362e2f5ea..dceef2abd4e2 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -16,6 +16,9 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_metadir.h" +#include "xfs_metafile.h" int xfs_calc_dquots_per_chunk( @@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts( return cpu_to_be32(t); } + +inline unsigned int +xfs_dqinode_sick_mask(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_SICK_FS_UQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_SICK_FS_GQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_SICK_FS_PQUOTA; + } + + ASSERT(0); + return 0; +} + +/* + * Load the inode for a given type of quota, assuming that the sb fields have + * been sorted out. This is not true when switching quota types on a V4 + * filesystem, so do not use this function for that. If metadir is enabled, + * @dp must be the /quota metadir. + * + * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on + * success; or a negative errno. + */ +int +xfs_dqinode_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type); + int error; + + if (!xfs_has_metadir(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_DQTYPE_USER: + ino = mp->m_sb.sb_uquotino; + break; + case XFS_DQTYPE_GROUP: + ino = mp->m_sb.sb_gquotino; + break; + case XFS_DQTYPE_PROJ: + ino = mp->m_sb.sb_pquotino; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + /* Should have set 0 to NULLFSINO when loading superblock */ + if (ino == NULLFSINO) + return -ENOENT; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip); + } else { + error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type), + metafile_type, &ip); + if (error == -ENOENT) + return error; + } + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return error; + } + + if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + +/* Create a metadata directory quota inode. */ +int +xfs_dqinode_metadir_create( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + }; + int error; + + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + error = xfs_metadir_commit(&upd); + if (error) + return error; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; +} + +#ifndef __KERNEL__ +/* Link a metadata directory quota inode. */ +int +xfs_dqinode_metadir_link( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode *ip) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + .ip = ip, + }; + int error; + + error = xfs_metadir_start_link(&upd); + if (error) + return error; + + error = xfs_metadir_link(&upd); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + return xfs_metadir_commit(&upd); +} +#endif /* __KERNEL__ */ + +/* Create the parent directory for all quota inodes and load it. */ +int +xfs_dqinode_mkdir_parent( + struct xfs_mount *mp, + struct xfs_inode **dpp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp); +} + +/* + * Load the parent directory of all quota inodes. Pass the inode to the caller + * because quota functions (e.g. QUOTARM) can be called on the quota files even + * if quotas are not enabled. + */ +int +xfs_dqinode_load_parent( + struct xfs_trans *tp, + struct xfs_inode **dpp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR, + dpp); +} diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 01a9e86b3037..a53c5d40e084 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -63,7 +63,9 @@ #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 -#define XFS_ERRTAG_MAX 44 +#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 +#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 +#define XFS_ERRTAG_MAX 46 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -111,5 +113,7 @@ #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 +#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 +#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c new file mode 100644 index 000000000000..3f1d6a98c118 --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -0,0 +1,1237 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_exchmaps_item.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr.h" +#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" +#include "xfs_symlink_remote.h" + +struct kmem_cache *xfs_exchmaps_intent_cache; + +/* bmbt mappings adjacent to a pair of records. */ +struct xfs_exchmaps_adjacent { + struct xfs_bmbt_irec left1; + struct xfs_bmbt_irec right1; + struct xfs_bmbt_irec left2; + struct xfs_bmbt_irec right2; +}; + +#define ADJACENT_INIT { \ + .left1 = { .br_startblock = HOLESTARTBLOCK }, \ + .right1 = { .br_startblock = HOLESTARTBLOCK }, \ + .left2 = { .br_startblock = HOLESTARTBLOCK }, \ + .right2 = { .br_startblock = HOLESTARTBLOCK }, \ +} + +/* Information to reset reflink flag / CoW fork state after an exchange. */ + +/* + * If the reflink flag is set on either inode, make sure it has an incore CoW + * fork, since all reflink inodes must have them. If there's a CoW fork and it + * has mappings in it, make sure the inodes are tagged appropriately so that + * speculative preallocations can be GC'd if we run low of space. + */ +static inline void +xfs_exchmaps_ensure_cowfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *cfork; + + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); + if (!cfork) + return; + if (cfork->if_bytes > 0) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); +} + +/* + * Adjust the on-disk inode size upwards if needed so that we never add + * mappings into the file past EOF. This is crucial so that log recovery won't + * get confused by the sudden appearance of post-eof mappings. + */ +STATIC void +xfs_exchmaps_update_size( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fsize_t new_isize) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsize_t len; + + if (new_isize < 0) + return; + + len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount), + new_isize); + + if (len <= ip->i_disk_size) + return; + + trace_xfs_exchmaps_update_inode_size(ip, len); + + ip->i_disk_size = len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Advance the incore state tracking after exchanging a mapping. */ +static inline void +xmi_advance( + struct xfs_exchmaps_intent *xmi, + const struct xfs_bmbt_irec *irec) +{ + xmi->xmi_startoff1 += irec->br_blockcount; + xmi->xmi_startoff2 += irec->br_blockcount; + xmi->xmi_blockcount -= irec->br_blockcount; +} + +/* Do we still have more mappings to exchange? */ +static inline bool +xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_blockcount > 0; +} + +/* Do we have post-operation cleanups to perform? */ +static inline bool +xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK | + XFS_EXCHMAPS_CLEAR_INO2_REFLINK | + __XFS_EXCHMAPS_INO2_SHORTFORM); +} + +/* Check all mappings to make sure we can actually exchange them. */ +int +xfs_exchmaps_check_forks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_ifork *ifp1, *ifp2; + int whichfork = xfs_exchmaps_reqfork(req); + + /* No fork? */ + ifp1 = xfs_ifork_ptr(req->ip1, whichfork); + ifp2 = xfs_ifork_ptr(req->ip2, whichfork); + if (!ifp1 || !ifp2) + return -EINVAL; + + /* We don't know how to exchange local format forks. */ + if (ifp1->if_format == XFS_DINODE_FMT_LOCAL || + ifp2->if_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_XFS_QUOTA +/* Log the actual updates to the quota accounting. */ +static inline void +xfs_exchmaps_update_quota( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int64_t ip1_delta = 0, ip2_delta = 0; + unsigned int qflag; + + qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT : + XFS_TRANS_DQ_BCOUNT; + + if (xfs_bmap_is_real_extent(irec1)) { + ip1_delta -= irec1->br_blockcount; + ip2_delta += irec1->br_blockcount; + } + + if (xfs_bmap_is_real_extent(irec2)) { + ip1_delta += irec2->br_blockcount; + ip2_delta -= irec2->br_blockcount; + } + + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta); + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta); +} +#else +# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0) +#endif + +/* Decide if we want to skip this mapping from file1. */ +static inline bool +xfs_exchmaps_can_skip_mapping( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = xmi->xmi_ip1->i_mount; + + /* Do not skip this mapping if the caller did not tell us to. */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN)) + return false; + + /* Do not skip mapped, written mappings. */ + if (xfs_bmap_is_written_extent(irec)) + return false; + + /* + * The mapping is unwritten or a hole. It cannot be a delalloc + * reservation because we already excluded those. It cannot be an + * unwritten extent with dirty page cache because we flushed the page + * cache. For files where the allocation unit is 1FSB (files on the + * data dev, rt files if the extent size is 1FSB), we can safely + * skip this mapping. + */ + if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1)) + return true; + + /* + * For a realtime file with a multi-fsb allocation unit, the decision + * is trickier because we can only swap full allocation units. + * Unwritten mappings can appear in the middle of an rtx if the rtx is + * partially written, but they can also appear for preallocations. + * + * If the mapping is a hole, skip it entirely. Holes should align with + * rtx boundaries. + */ + if (!xfs_bmap_is_real_extent(irec)) + return true; + + /* + * All mappings below this point are unwritten. + * + * - If the beginning is not aligned to an rtx, trim the end of the + * mapping so that it does not cross an rtx boundary, and swap it. + * + * - If both ends are aligned to an rtx, skip the entire mapping. + */ + if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) { + xfs_fileoff_t new_end; + + new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize); + irec->br_blockcount = min(irec->br_blockcount, + new_end - irec->br_startoff); + return false; + } + if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize)) + return true; + + /* + * All mappings below this point are unwritten, start on an rtx + * boundary, and do not end on an rtx boundary. + * + * - If the mapping is longer than one rtx, trim the end of the mapping + * down to an rtx boundary and skip it. + * + * - The mapping is shorter than one rtx. Swap it. + */ + if (irec->br_blockcount > mp->m_sb.sb_rextsize) { + xfs_fileoff_t new_end; + + new_end = rounddown_64(irec->br_startoff + irec->br_blockcount, + mp->m_sb.sb_rextsize); + irec->br_blockcount = new_end - irec->br_startoff; + return true; + } + + return false; +} + +/* + * Walk forward through the file ranges in @xmi until we find two different + * mappings to exchange. If there is work to do, return the mappings; + * otherwise we've reached the end of the range and xmi_blockcount will be + * zero. + * + * If the walk skips over a pair of mappings to the same storage, save them as + * the left records in @adj (if provided) so that the simulation phase can + * avoid an extra lookup. + */ +static int +xfs_exchmaps_find_mappings( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2, + struct xfs_exchmaps_adjacent *adj) +{ + int nimaps; + int bmap_flags; + int error; + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi)); + + for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) { + /* Read mapping from the first file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1, + xmi->xmi_blockcount, irec1, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec1->br_startblock == DELAYSTARTBLOCK || + irec1->br_startoff != xmi->xmi_startoff1) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. + */ + ASSERT(0); + return -EINVAL; + } + + if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) { + trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1); + continue; + } + + /* Read mapping from the second file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2, + irec1->br_blockcount, irec2, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec2->br_startblock == DELAYSTARTBLOCK || + irec2->br_startoff != xmi->xmi_startoff2) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. + */ + ASSERT(0); + return -EINVAL; + } + + /* + * We can only exchange as many blocks as the smaller of the + * two mapping maps. + */ + irec1->br_blockcount = min(irec1->br_blockcount, + irec2->br_blockcount); + + trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1); + trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2); + + /* We found something to exchange, so return it. */ + if (irec1->br_startblock != irec2->br_startblock) + return 0; + + /* + * Two mappings pointing to the same physical block must not + * have different states; that's filesystem corruption. Move + * on to the next mapping if they're both holes or both point + * to the same physical space extent. + */ + if (irec1->br_state != irec2->br_state) { + xfs_bmap_mark_sick(xmi->xmi_ip1, + xfs_exchmaps_whichfork(xmi)); + xfs_bmap_mark_sick(xmi->xmi_ip2, + xfs_exchmaps_whichfork(xmi)); + return -EFSCORRUPTED; + } + + /* + * Save the mappings if we're estimating work and skipping + * these identical mappings. + */ + if (adj) { + memcpy(&adj->left1, irec1, sizeof(*irec1)); + memcpy(&adj->left2, irec2, sizeof(*irec2)); + } + } + + return 0; +} + +/* Exchange these two mappings. */ +static void +xfs_exchmaps_one_step( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int whichfork = xfs_exchmaps_whichfork(xmi); + + xfs_exchmaps_update_quota(tp, xmi, irec1, irec2); + + /* Remove both mappings. */ + xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1); + xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2); + + /* + * Re-add both mappings. We exchange the file offsets between the two + * maps and add the opposite map, which has the effect of filling the + * logical offsets we just unmapped, but with with the physical mapping + * information exchanged. + */ + swap(irec1->br_startoff, irec2->br_startoff); + xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2); + xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1); + + /* Make sure we're not adding mappings past EOF. */ + if (whichfork == XFS_DATA_FORK) { + xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2, + xmi->xmi_isize1); + xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1, + xmi->xmi_isize2); + } + + /* + * Advance our cursor and exit. The caller (either defer ops or log + * recovery) will log the XMD item, and if *blockcount is nonzero, it + * will log a new XMI item for the remainder and call us back. + */ + xmi_advance(xmi, irec1); +} + +/* Convert inode2's leaf attr fork back to shortform, if possible.. */ +STATIC int +xfs_exchmaps_attr_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_buf *bp; + int forkoff; + int error; + + if (!xfs_attr_is_leaf(xmi->xmi_ip2)) + return 0; + + error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0, + &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2); + if (forkoff == 0) + return 0; + + return xfs_attr3_leaf_to_shortform(bp, &args, forkoff); +} + +/* Convert inode2's block dir fork back to shortform, if possible.. */ +STATIC int +xfs_exchmaps_dir_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_dir2_sf_hdr sfh; + struct xfs_buf *bp; + int size; + int error = 0; + + if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK) + return error; + + error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp); + if (error) + return error; + + size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh); + if (size > xfs_inode_data_fork_size(xmi->xmi_ip2)) + return 0; + + return xfs_dir2_block_to_sf(&args, bp, size, &sfh); +} + +/* Convert inode2's remote symlink target back to shortform, if possible. */ +STATIC int +xfs_exchmaps_link_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_inode *ip = xmi->xmi_ip2; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + char *buf; + int error; + + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + ip->i_disk_size > xfs_inode_data_fork_size(ip)) + return 0; + + /* Read the current symlink target into a buffer. */ + buf = kmalloc(ip->i_disk_size + 1, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + if (!buf) { + ASSERT(0); + return -ENOMEM; + } + + error = xfs_symlink_remote_read(ip, buf); + if (error) + goto free; + + /* Remove the blocks. */ + error = xfs_symlink_remote_truncate(tp, ip); + if (error) + goto free; + + /* Convert fork to local format and log our changes. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size); + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); +free: + kfree(buf); + return error; +} + +/* Clear the reflink flag after an exchange. */ +static inline void +xfs_exchmaps_clear_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_unset_inode_flag(ip); + + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Finish whatever work might come after an exchange operation. */ +static int +xfs_exchmaps_do_postop_work( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) { + int error = 0; + + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + error = xfs_exchmaps_attr_to_sf(tp, xmi); + else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_dir_to_sf(tp, xmi); + else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_link_to_sf(tp, xmi); + xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM; + if (error) + return error; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + return 0; +} + +/* Finish one step in a mapping exchange operation, possibly relogging. */ +int +xfs_exchmaps_finish_one( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_bmbt_irec irec1, irec2; + int error; + + if (xmi_has_more_exchange_work(xmi)) { + /* + * If the operation state says that some range of the files + * have not yet been exchanged, look for mappings in that range + * to exchange. If we find some mappings, exchange them. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL); + if (error) + return error; + + if (xmi_has_more_exchange_work(xmi)) + xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2); + + /* + * If the caller asked us to exchange the file sizes after the + * exchange and either we just exchanged the last mappings in + * the range or we didn't find anything to exchange, update the + * ondisk file sizes. + */ + if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) && + !xmi_has_more_exchange_work(xmi)) { + xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1; + xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2; + + xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE); + } + } else if (xmi_has_postop_work(xmi)) { + /* + * Now that we're finished with the exchange operation, + * complete the post-op cleanup work. + */ + error = xfs_exchmaps_do_postop_work(tp, xmi); + if (error) + return error; + } + + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + return -EIO; + + /* If we still have work to do, ask for a new transaction. */ + if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) { + trace_xfs_exchmaps_defer(tp->t_mountp, xmi); + return -EAGAIN; + } + + /* + * If we reach here, we've finished all the exchange work and the post + * operation work. The last thing we need to do before returning to + * the caller is to make sure that COW forks are set up correctly. + */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) { + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1); + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2); + } + + return 0; +} + +/* + * Compute the amount of bmbt blocks we should reserve for each file. In the + * worst case, each exchange will fill a hole with a new mapping, which could + * result in a btree split every time we add a new leaf block. + */ +static inline uint64_t +xfs_exchmaps_bmbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) * + XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req)); +} + +/* Compute the space we should reserve for the rmap btree expansions. */ +static inline uint64_t +xfs_exchmaps_rmapbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + if (XFS_IS_REALTIME_INODE(req->ip1)) + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * + XFS_RTRMAPADD_SPACE_RES(mp); + + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * + XFS_RMAPADD_SPACE_RES(mp); +} + +/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */ +int +xfs_exchmaps_estimate_overhead( + struct xfs_exchmaps_req *req) +{ + struct xfs_mount *mp = req->ip1->i_mount; + xfs_filblks_t bmbt_blocks; + xfs_filblks_t rmapbt_blocks; + xfs_filblks_t resblks = req->resblks; + + /* + * Compute the number of bmbt and rmapbt blocks we might need to handle + * the estimated number of exchanges. + */ + bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req); + rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req); + + trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks); + + /* Make sure the change in file block count doesn't overflow. */ + if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount)) + return -EFBIG; + if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount)) + return -EFBIG; + + /* + * Add together the number of blocks we need to handle btree growth, + * then add it to the number of blocks we need to reserve to this + * transaction. + */ + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + + /* Can't actually reserve more than UINT_MAX blocks. */ + if (req->resblks > UINT_MAX) + return -ENOSPC; + + req->resblks = resblks; + trace_xfs_exchmaps_final_estimate(req); + return 0; +} + +/* Decide if we can merge two real mappings. */ +static inline bool +xmi_can_merge( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't merge holes. */ + if (b1->br_startblock == HOLESTARTBLOCK || + b2->br_startblock == HOLESTARTBLOCK) + return false; + + /* We don't merge holes. */ + if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2)) + return false; + + if (b1->br_startoff + b1->br_blockcount == b2->br_startoff && + b1->br_startblock + b1->br_blockcount == b2->br_startblock && + b1->br_state == b2->br_state && + b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + return true; + + return false; +} + +/* + * Decide if we can merge three mappings. Caller must ensure all three + * mappings must not be holes or delalloc reservations. + */ +static inline bool +xmi_can_merge_all( + const struct xfs_bmbt_irec *l, + const struct xfs_bmbt_irec *m, + const struct xfs_bmbt_irec *r) +{ + xfs_filblks_t new_len; + + new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount; + return new_len <= XFS_MAX_BMBT_EXTLEN; +} + +#define CLEFT_CONTIG 0x01 +#define CRIGHT_CONTIG 0x02 +#define CHOLE 0x04 +#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG) + +#define NLEFT_CONTIG 0x10 +#define NRIGHT_CONTIG 0x20 +#define NHOLE 0x40 +#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG) + +/* Estimate the effect of a single exchange on mapping count. */ +static inline int +xmi_delta_nextents_step( + struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right) +{ + bool lhole, rhole, chole, nhole; + unsigned int state = 0; + int ret = 0; + + lhole = left->br_startblock == HOLESTARTBLOCK; + rhole = right->br_startblock == HOLESTARTBLOCK; + chole = curr->br_startblock == HOLESTARTBLOCK; + nhole = new->br_startblock == HOLESTARTBLOCK; + + if (chole) + state |= CHOLE; + if (!lhole && !chole && xmi_can_merge(left, curr)) + state |= CLEFT_CONTIG; + if (!rhole && !chole && xmi_can_merge(curr, right)) + state |= CRIGHT_CONTIG; + if ((state & CBOTH_CONTIG) == CBOTH_CONTIG && + !xmi_can_merge_all(left, curr, right)) + state &= ~CRIGHT_CONTIG; + + if (nhole) + state |= NHOLE; + if (!lhole && !nhole && xmi_can_merge(left, new)) + state |= NLEFT_CONTIG; + if (!rhole && !nhole && xmi_can_merge(new, right)) + state |= NRIGHT_CONTIG; + if ((state & NBOTH_CONTIG) == NBOTH_CONTIG && + !xmi_can_merge_all(left, new, right)) + state &= ~NRIGHT_CONTIG; + + switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) { + case CLEFT_CONTIG | CRIGHT_CONTIG: + /* + * left/curr/right are the same mapping, so deleting curr + * causes 2 new mappings to be created. + */ + ret += 2; + break; + case 0: + /* + * curr is not contiguous with any mapping, so we remove curr + * completely + */ + ret--; + break; + case CHOLE: + /* hole, do nothing */ + break; + case CLEFT_CONTIG: + case CRIGHT_CONTIG: + /* trim either left or right, no change */ + break; + } + + switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { + case NLEFT_CONTIG | NRIGHT_CONTIG: + /* + * left/curr/right will become the same mapping, so adding + * curr causes the deletion of right. + */ + ret--; + break; + case 0: + /* new is not contiguous with any mapping */ + ret++; + break; + case NHOLE: + /* hole, do nothing. */ + break; + case NLEFT_CONTIG: + case NRIGHT_CONTIG: + /* new is absorbed into left or right, no change */ + break; + } + + trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret, + state); + return ret; +} + +/* Make sure we don't overflow the extent (mapping) counters. */ +static inline int +xmi_ensure_delta_nextents( + struct xfs_exchmaps_req *req, + struct xfs_inode *ip, + int64_t delta) +{ + struct xfs_mount *mp = ip->i_mount; + int whichfork = xfs_exchmaps_reqfork(req); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + uint64_t new_nextents; + xfs_extnum_t max_nextents; + + if (delta < 0) + return 0; + + /* + * It's always an error if the delta causes integer overflow. delta + * needs an explicit cast here to avoid warnings about implicit casts + * coded into the overflow check. + */ + if (check_add_overflow(ifp->if_nextents, (uint64_t)delta, + &new_nextents)) + return -EFBIG; + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + new_nextents > 10) + return -EFBIG; + + /* + * We always promote both inodes to have large extent counts if the + * superblock feature is enabled, so we only need to check against the + * theoretical maximum. + */ + max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (new_nextents > max_nextents) + return -EFBIG; + + return 0; +} + +/* Find the next mapping after irec. */ +static inline int +xmi_next( + struct xfs_inode *ip, + int bmap_flags, + const struct xfs_bmbt_irec *irec, + struct xfs_bmbt_irec *nrec) +{ + xfs_fileoff_t off; + xfs_filblks_t blockcount; + int nimaps = 1; + int error; + + off = irec->br_startoff + irec->br_blockcount; + blockcount = XFS_MAX_FILEOFF - off; + error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags); + if (error) + return error; + if (nrec->br_startblock == DELAYSTARTBLOCK || + nrec->br_startoff != off) { + /* + * If we don't get the mapping we want, return a zero-length + * mapping, which our estimator function will pretend is a hole. + * We shouldn't get delalloc reservations. + */ + nrec->br_startblock = HOLESTARTBLOCK; + } + + return 0; +} + +int __init +xfs_exchmaps_intent_init_cache(void) +{ + xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent", + sizeof(struct xfs_exchmaps_intent), + 0, 0, NULL); + + return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_exchmaps_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_exchmaps_intent_cache); + xfs_exchmaps_intent_cache = NULL; +} + +/* + * Decide if we will exchange the reflink flags between the two files after the + * exchange. The only time we want to do this is if we're exchanging all + * mappings under EOF and the inode reflink flags have different states. + */ +static inline bool +xmi_can_exchange_reflink_flags( + const struct xfs_exchmaps_req *req, + unsigned int reflink_state) +{ + struct xfs_mount *mp = req->ip1->i_mount; + + if (hweight32(reflink_state) != 1) + return false; + if (req->startoff1 != 0 || req->startoff2 != 0) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size)) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size)) + return false; + return true; +} + + +/* Allocate and initialize a new incore intent item from a request. */ +struct xfs_exchmaps_intent * +xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + unsigned int rs = 0; + + xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&xmi->xmi_list); + xmi->xmi_ip1 = req->ip1; + xmi->xmi_ip2 = req->ip2; + xmi->xmi_startoff1 = req->startoff1; + xmi->xmi_startoff2 = req->startoff2; + xmi->xmi_blockcount = req->blockcount; + xmi->xmi_isize1 = xmi->xmi_isize2 = -1; + xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS; + + if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) { + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + return xmi; + } + + if (req->flags & XFS_EXCHMAPS_SET_SIZES) { + xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES; + xmi->xmi_isize1 = req->ip2->i_disk_size; + xmi->xmi_isize2 = req->ip1->i_disk_size; + } + + /* Record the state of each inode's reflink flag before the op. */ + if (xfs_is_reflink_inode(req->ip1)) + rs |= 1; + if (xfs_is_reflink_inode(req->ip2)) + rs |= 2; + + /* + * Figure out if we're clearing the reflink flags (which effectively + * exchanges them) after the operation. + */ + if (xmi_can_exchange_reflink_flags(req, rs)) { + if (rs & 1) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + if (rs & 2) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) || + S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + + return xmi; +} + +/* + * Estimate the number of exchange operations and the number of file blocks + * in each file that will be affected by the exchange operation. + */ +int +xfs_exchmaps_estimate( + struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + struct xfs_bmbt_irec irec1, irec2; + struct xfs_exchmaps_adjacent adj = ADJACENT_INIT; + xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0; + int64_t d_nexts1, d_nexts2; + int bmap_flags; + int error; + + ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS)); + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req)); + xmi = xfs_exchmaps_init_intent(req); + + /* + * To guard against the possibility of overflowing the extent counters, + * we have to estimate an upper bound on the potential increase in that + * counter. We can split the mapping at each end of the range, and for + * each step of the exchange we can split the mapping that we're + * working on if the mappings do not align. + */ + d_nexts1 = d_nexts2 = 3; + + while (xmi_has_more_exchange_work(xmi)) { + /* + * Walk through the file ranges until we find something to + * exchange. Because we're simulating the exchange, pass in + * adj to capture skipped mappings for correct estimation of + * bmbt record merges. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj); + if (error) + goto out_free; + if (!xmi_has_more_exchange_work(xmi)) + break; + + /* Update accounting. */ + if (xfs_bmap_is_real_extent(&irec1)) + ip1_blocks += irec1.br_blockcount; + if (xfs_bmap_is_real_extent(&irec2)) + ip2_blocks += irec2.br_blockcount; + req->nr_exchanges++; + + /* Read the next mappings from both files. */ + error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1); + if (error) + goto out_free; + + error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2); + if (error) + goto out_free; + + /* Update extent count deltas. */ + d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left1, &irec1, &irec2, &adj.right1); + + d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left2, &irec2, &irec1, &adj.right2); + + /* Now pretend we exchanged the mappings. */ + if (xmi_can_merge(&adj.left2, &irec1)) + adj.left2.br_blockcount += irec1.br_blockcount; + else + memcpy(&adj.left2, &irec1, sizeof(irec1)); + + if (xmi_can_merge(&adj.left1, &irec2)) + adj.left1.br_blockcount += irec2.br_blockcount; + else + memcpy(&adj.left1, &irec2, sizeof(irec2)); + + xmi_advance(xmi, &irec1); + } + + /* Account for the blocks that are being exchanged. */ + if (XFS_IS_REALTIME_INODE(req->ip1) && + xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) { + req->ip1_rtbcount = ip1_blocks; + req->ip2_rtbcount = ip2_blocks; + } else { + req->ip1_bcount = ip1_blocks; + req->ip2_bcount = ip2_blocks; + } + + /* + * Make sure that both forks have enough slack left in their extent + * counters that the exchange operation will not overflow. + */ + trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2); + if (req->ip1 == req->ip2) { + error = xmi_ensure_delta_nextents(req, req->ip1, + d_nexts1 + d_nexts2); + } else { + error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1); + if (error) + goto out_free; + error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2); + } + if (error) + goto out_free; + + trace_xfs_exchmaps_initial_estimate(req); + error = xfs_exchmaps_estimate_overhead(req); +out_free: + kmem_cache_free(xfs_exchmaps_intent_cache, xmi); + return error; +} + +/* Set the reflink flag before an operation. */ +static inline void +xfs_exchmaps_set_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_set_inode_flag(ip); + + ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * If either file has shared blocks and we're exchanging data forks, we must + * flag the other file as having shared blocks so that we get the shared-block + * rmap functions if we need to fix up the rmaps. + */ +void +xfs_exchmaps_ensure_reflink( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + unsigned int rs = 0; + + if (xfs_is_reflink_inode(xmi->xmi_ip1)) + rs |= 1; + if (xfs_is_reflink_inode(xmi->xmi_ip2)) + rs |= 2; + + if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2); + + if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1); +} + +/* Set the large extent count flag before an operation if needed. */ +static inline void +xfs_exchmaps_ensure_large_extent_counts( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + if (xfs_inode_has_large_extent_counts(ip)) + return; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Widen the extent counter fields of both inodes if necessary. */ +void +xfs_exchmaps_upgrade_extent_counts( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + if (!xfs_has_large_extent_counts(tp->t_mountp)) + return; + + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1); + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2); +} + +/* + * Schedule an exchange a range of mappings from one inode to another. + * + * The use of file mapping exchange log intent items ensures the operation can + * be resumed even if the system goes down. The caller must commit the + * transaction to start the work. + * + * The caller must ensure the inodes must be joined to the transaction and + * ILOCKd; they will still be joined to the transaction at exit. + */ +void +xfs_exchange_mappings( + struct xfs_trans *tp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + + BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS); + + xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL); + xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL); + ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)); + if (req->flags & XFS_EXCHMAPS_SET_SIZES) + ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK)); + ASSERT(xfs_has_exchange_range(tp->t_mountp)); + + if (req->blockcount == 0) + return; + + xmi = xfs_exchmaps_init_intent(req); + xfs_exchmaps_defer_add(tp, xmi); + xfs_exchmaps_ensure_reflink(tp, xmi); + xfs_exchmaps_upgrade_extent_counts(tp, xmi); +} diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h new file mode 100644 index 000000000000..fa822dff202a --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHMAPS_H__ +#define __XFS_EXCHMAPS_H__ + +/* In-core deferred operation info about a file mapping exchange request. */ +struct xfs_exchmaps_intent { + /* List of other incore deferred work. */ + struct list_head xmi_list; + + /* Inodes participating in the operation. */ + struct xfs_inode *xmi_ip1; + struct xfs_inode *xmi_ip2; + + /* File offset range information. */ + xfs_fileoff_t xmi_startoff1; + xfs_fileoff_t xmi_startoff2; + xfs_filblks_t xmi_blockcount; + + /* Set these file sizes after the operation, unless negative. */ + xfs_fsize_t xmi_isize1; + xfs_fsize_t xmi_isize2; + + uint64_t xmi_flags; /* XFS_EXCHMAPS_* flags */ +}; + +/* Try to convert inode2 from block to short format at the end, if possible. */ +#define __XFS_EXCHMAPS_INO2_SHORTFORM (1ULL << 63) + +#define XFS_EXCHMAPS_INTERNAL_FLAGS (__XFS_EXCHMAPS_INO2_SHORTFORM) + +/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */ +#define XFS_EXCHMAPS_PARAMS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN) + +static inline int +xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +/* Parameters for a mapping exchange request. */ +struct xfs_exchmaps_req { + /* Inodes participating in the operation. */ + struct xfs_inode *ip1; + struct xfs_inode *ip2; + + /* File offset range information. */ + xfs_fileoff_t startoff1; + xfs_fileoff_t startoff2; + xfs_filblks_t blockcount; + + /* XFS_EXCHMAPS_* operation flags */ + uint64_t flags; + + /* + * Fields below this line are filled out by xfs_exchmaps_estimate; + * callers should initialize this part of the struct to zero. + */ + + /* + * Data device blocks to be moved out of ip1, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip1_bcount; + + /* + * Data device blocks to be moved out of ip2, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip2_bcount; + + /* rt blocks to be moved out of ip1. */ + xfs_filblks_t ip1_rtbcount; + + /* rt blocks to be moved out of ip2. */ + xfs_filblks_t ip2_rtbcount; + + /* Free space needed to handle the bmbt changes */ + unsigned long long resblks; + + /* Number of exchanges needed to complete the operation */ + unsigned long long nr_exchanges; +}; + +static inline int +xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req) +{ + if (req->flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req); +int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req); + +extern struct kmem_cache *xfs_exchmaps_intent_cache; + +int __init xfs_exchmaps_intent_init_cache(void); +void xfs_exchmaps_intent_destroy_cache(void); + +struct xfs_exchmaps_intent *xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req); +void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); +void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_finish_one(struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_check_forks(struct xfs_mount *mp, + const struct xfs_exchmaps_req *req); + +void xfs_exchange_mappings(struct xfs_trans *tp, + const struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHMAPS_H__ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 382ab1e71c0b..9566a7623365 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -90,8 +90,7 @@ struct xfs_ifork; #define XFSLABEL_MAX 12 /* - * Superblock - in core version. Must match the ondisk version below. - * Must be padded to 64 bit alignment. + * Superblock - in core version. Must be padded to 64 bit alignment. */ typedef struct xfs_sb { uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ @@ -175,13 +174,20 @@ typedef struct xfs_sb { xfs_lsn_t sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ + xfs_ino_t sb_metadirino; /* metadata directory tree root */ + + xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ + xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ + uint8_t sb_rgblklog; /* rt group number shift */ + uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; -#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) - /* - * Superblock - on disk version. Must match the in core version above. + * Superblock - on disk version. * Must be padded to 64 bit alignment. */ struct xfs_dsb { @@ -262,9 +268,24 @@ struct xfs_dsb { __be64 sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ - /* must be padded to 64 bit alignment */ + __be64 sb_metadirino; /* metadata directory tree root */ + __be32 sb_rgcount; /* # of realtime groups */ + __be32 sb_rgextents; /* size of rtgroup in rtx */ + __u8 sb_rgblklog; /* rt group number shift */ + __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ + + /* + * The size of this structure must be padded to 64 bit alignment. + * + * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when + * adding new fields here. + */ }; +#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) + /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. @@ -279,7 +300,7 @@ struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -288,12 +309,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. */ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp) { return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) || (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); @@ -343,8 +364,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) #define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL static inline bool xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -361,31 +382,42 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ -#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ -#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ -#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ -#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ +#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ +#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ +#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID| \ - XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ - XFS_SB_FEAT_INCOMPAT_NREXT64) + (XFS_SB_FEAT_INCOMPAT_FTYPE | \ + XFS_SB_FEAT_INCOMPAT_SPINODES | \ + XFS_SB_FEAT_INCOMPAT_META_UUID | \ + XFS_SB_FEAT_INCOMPAT_BIGTIME | \ + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ + XFS_SB_FEAT_INCOMPAT_NREXT64 | \ + XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ + XFS_SB_FEAT_INCOMPAT_PARENT | \ + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -396,8 +428,8 @@ xfs_sb_has_incompat_feature( #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -417,7 +449,7 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } -static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -477,15 +509,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the - * arrays below. - */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) - -/* - * The second word of agf_levels in the first a.g. overlaps the EFS - * superblock's magic number. Since the magic numbers valid for EFS - * are > 64k, our value cannot be confused for an EFS superblock's. + * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number. + * Since the magic numbers valid for EFS are > 64k, our value cannot be confused + * for an EFS superblock. */ typedef struct xfs_agf { @@ -499,8 +525,13 @@ typedef struct xfs_agf { /* * Freespace and rmap information */ - __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_bno_root; /* bnobt root block */ + __be32 agf_cnt_root; /* cntbt root block */ + __be32 agf_rmap_root; /* rmapbt root block */ + + __be32 agf_bno_level; /* bnobt btree levels */ + __be32 agf_cnt_level; /* cntbt btree levels */ + __be32 agf_rmap_level; /* rmapbt btree levels */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ @@ -692,21 +723,58 @@ struct xfs_agfl { /* * Realtime bitmap information is accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_rtword_raw { __u32 old; + __be32 rtg; }; /* * Realtime summary counts are accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_suminfo_raw { __u32 old; + __be32 rtg; }; /* + * Realtime allocation groups break the rt section into multiple pieces that + * could be locked independently. Realtime block group numbers are 32-bit + * quantities. Block numbers within a group are also 32-bit quantities, but + * the upper bit must never be set. rtgroup 0 might have a superblock in it, + * so the minimum size of an rtgroup is 2 rtx. + */ +#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1) +#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2) +#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U)) + +#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */ + +/* + * Realtime superblock - on disk version. Must be padded to 64 bit alignment. + * The first block of the realtime volume contains this superblock. + */ +struct xfs_rtsb { + __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */ + __le32 rsb_crc; /* superblock crc */ + + __be32 rsb_pad; /* zero */ + unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */ + + uuid_t rsb_uuid; /* user-visible file system unique id */ + uuid_t rsb_meta_uuid; /* metadata file system unique id */ + + /* must be padded to 64 bit alignment */ +}; + +#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc) +#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */ + +/* * XFS Timestamps * ============== * @@ -788,6 +856,31 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; } +enum xfs_metafile_type { + XFS_METAFILE_UNKNOWN, /* unknown */ + XFS_METAFILE_DIR, /* metadir directory */ + XFS_METAFILE_USRQUOTA, /* user quota */ + XFS_METAFILE_GRPQUOTA, /* group quota */ + XFS_METAFILE_PRJQUOTA, /* project quota */ + XFS_METAFILE_RTBITMAP, /* rt bitmap */ + XFS_METAFILE_RTSUMMARY, /* rt summary */ + XFS_METAFILE_RTRMAP, /* rt rmap */ + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ + + XFS_METAFILE_MAX +} __packed; + +#define XFS_METAFILE_TYPE_STR \ + { XFS_METAFILE_UNKNOWN, "unknown" }, \ + { XFS_METAFILE_DIR, "dir" }, \ + { XFS_METAFILE_USRQUOTA, "usrquota" }, \ + { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ + { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ + { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ + { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } + /* * On-disk inode structure. * @@ -810,7 +903,7 @@ struct xfs_dinode { __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ + __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */ __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ @@ -866,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ @@ -899,6 +997,12 @@ static inline uint xfs_dinode_size(int version) #define XFS_MAXLINK ((1U << 31) - 1U) /* + * Any file that hits the maximum ondisk link count should be pinned to avoid + * a use-after-free situation. + */ +#define XFS_NLINK_PINNED (~0U) + +/* * Values for di_format * * This enum is used in string mapping in xfs_trace.h; please keep the @@ -909,7 +1013,8 @@ enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* added long ago, but never used */ + XFS_DINODE_FMT_UUID, /* added long ago, but never used */ + XFS_DINODE_FMT_META_BTREE, /* metadata btree */ }; #define XFS_INODE_FORMAT_STR \ @@ -917,7 +1022,8 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_LOCAL, "local" }, \ { XFS_DINODE_FMT_EXTENTS, "extent" }, \ { XFS_DINODE_FMT_BTREE, "btree" }, \ - { XFS_DINODE_FMT_UUID, "uuid" } + { XFS_DINODE_FMT_UUID, "uuid" }, \ + { XFS_DINODE_FMT_META_BTREE, "meta_btree" } /* * Max values for extnum and aextnum. @@ -1080,21 +1186,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Values for di_flags2 These start by being exposed to userspace in the upper * 16 bits of the XFS_XFLAG_s range. */ -#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ -#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ -#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ -#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ -#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +/* use DAX for this inode */ +#define XFS_DIFLAG2_DAX_BIT 0 + +/* file's blocks may be shared */ +#define XFS_DIFLAG2_REFLINK_BIT 1 + +/* copy on write extent size hint */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 + +/* big timestamps */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 -#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) -#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) -#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) -#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +/* large extent counters */ +#define XFS_DIFLAG2_NREXT64_BIT 4 + +/* + * The inode contains filesystem metadata and can be found through the metadata + * directory tree. Metadata inodes must satisfy the following constraints: + * + * - V5 filesystem (and ftype) are enabled; + * - The only valid modes are regular files and directories; + * - The access bits must be zero; + * - DMAPI event and state masks are zero; + * - The user and group IDs must be zero; + * - The project ID can be used as a u32 annotation; + * - The immutable, sync, noatime, nodump, nodefrag flags must be set. + * - The dax flag must not be set. + * - Directories must have nosymlinks set. + * + * These requirements are chosen defensively to minimize the ability of + * userspace to read or modify the contents, should a metadata file ever + * escape to userspace. + * + * There are further constraints on the directory tree itself: + * + * - Metadata inodes must never be resolvable through the root directory; + * - They must never be accessed by userspace; + * - Metadata directory entries must have correct ftype. + * + * Superblock-rooted metadata files must have the METADATA iflag set even + * though they do not have a parent directory. + */ +#define XFS_DIFLAG2_METADATA_BIT 5 + +#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1109,6 +1254,12 @@ static inline bool xfs_dinode_has_large_extent_counts( (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); } +static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1157,6 +1308,24 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* + * RT bit manipulation macros. + */ +#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */ +#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */ + +struct xfs_rtbuf_blkinfo { + __be32 rt_magic; /* validity check on block */ + __be32 rt_crc; /* CRC of block */ + __be64 rt_owner; /* inode that owns the block */ + __be64 rt_blkno; /* first block of the buffer */ + __be64 rt_lsn; /* sequence number of last write */ + uuid_t rt_uuid; /* filesystem we belong to */ +}; + +#define XFS_RTBUF_CRC_OFF \ + offsetof(struct xfs_rtbuf_blkinfo, rt_crc) + +/* * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ @@ -1575,6 +1744,24 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Realtime Reverse mapping btree format definitions + * + * This is a btree for reverse mapping records for realtime volumes + */ +#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ + +/* + * rtrmap root header, on-disk form only. + */ +struct xfs_rtrmap_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-based btree pointer type */ +typedef __be64 xfs_rtrmap_ptr_t; + +/* * Reference Count Btree format definitions * */ @@ -1617,12 +1804,29 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -#define MAXREFCOUNT ((xfs_nlink_t)~0U) -#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) +#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) +#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; +/* + * Realtime Reference Count btree format definitions + * + * This is a btree for reference count records for realtime volumes + */ +#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ + +/* + * rt refcount root header, on-disk form only. + */ +struct xfs_rtrefcount_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-rooted btree pointer type */ +typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 6360073865db..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -8,6 +8,7 @@ /* * SGI's XFS filesystem's major stuff (constants, structures) + * NOTE: This file must be compile-able with C++ compilers. */ /* @@ -186,7 +187,11 @@ struct xfs_fsop_geom { __u32 logsunit; /* log stripe unit, bytes */ uint32_t sick; /* o: unhealthy fs & rt metadata */ uint32_t checked; /* o: checked fs & rt metadata */ - __u64 reserved[17]; /* reserved space */ + __u32 rgextents; /* rt extents in a realtime group */ + __u32 rgcount; /* number of realtime groups */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -195,6 +200,10 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */ #define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ +#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ +#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ +#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */ +#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -237,6 +246,10 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ +#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ +#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. @@ -292,6 +305,7 @@ struct xfs_ag_geometry { #define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */ #define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */ /* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT @@ -406,6 +420,7 @@ struct xfs_bulkstat { #define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ #define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ +#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */ /* * Project quota id helpers (previously projid was 16bit only @@ -482,9 +497,17 @@ struct xfs_bulk_ireq { */ #define XFS_BULK_IREQ_NREXT64 (1U << 2) +/* + * Allow bulkstat to return information about metadata directories. This + * enables xfs_scrub to find them for scanning, as they are otherwise ordinary + * directories. + */ +#define XFS_BULK_IREQ_METADIR (1U << 3) + #define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ XFS_BULK_IREQ_SPECIAL | \ - XFS_BULK_IREQ_NREXT64) + XFS_BULK_IREQ_NREXT64 | \ + XFS_BULK_IREQ_METADIR) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -629,7 +652,9 @@ typedef struct xfs_fsop_attrmulti_handlereq { /* * per machine unique filesystem identifier types. */ -typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ +typedef struct xfs_fsid { + __u32 val[2]; /* file system id type */ +} xfs_fsid_t; typedef struct xfs_fid { __u16 fid_len; /* length of remainder */ @@ -709,9 +734,26 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ #define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ +#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ +#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ +#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ +#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ +#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ +#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ +#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ +#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 25 +#define XFS_SCRUB_TYPE_NR 33 + +/* + * This special type code only applies to the vectored scrub implementation. + * + * If any of the previous scrub vectors recorded runtime errors or have + * sv_flags bits set that match the OFLAG bits in the barrier vector's + * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace. + */ +#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF) /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) @@ -757,6 +799,47 @@ struct xfs_scrub_metadata { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) +/* Vectored scrub calls to reduce the number of kernel transitions. */ + +struct xfs_scrub_vec { + __u32 sv_type; /* XFS_SCRUB_TYPE_* */ + __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */ + __s32 sv_ret; /* 0 or a negative error code */ + __u32 sv_reserved; /* must be zero */ +}; + +/* Vectored metadata scrub control structure. */ +struct xfs_scrub_vec_head { + __u64 svh_ino; /* inode number. */ + __u32 svh_gen; /* inode generation. */ + __u32 svh_agno; /* ag number. */ + __u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */ + __u16 svh_rest_us; /* wait this much time between vector items */ + __u16 svh_nr; /* number of svh_vectors */ + __u64 svh_reserved; /* must be zero */ + __u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */ +}; + +#define XFS_SCRUB_VEC_FLAGS_ALL (0) + +/* + * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for + * path checking. + */ +#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? */ +#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtrgroups metadir */ +#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */ +#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */ +#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */ +#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ +#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ +#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ +#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ +#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ + +/* Number of metapath sm_ino values */ +#define XFS_SCRUB_METAPATH_NR (10) + /* * ioctl limits */ @@ -766,6 +849,159 @@ struct xfs_scrub_metadata { # define XFS_XATTR_LIST_MAX 65536 #endif +/* + * Exchange part of file1 with part of the file that this ioctl that is being + * called against (which we'll call file2). Filesystems must be able to + * restart and complete the operation even after the system goes down. + */ +struct xfs_exchange_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ +}; + +/* + * Using the same definition of file2 as struct xfs_exchange_range, commit the + * contents of file1 into file2 if file2 has the same inode number, mtime, and + * ctime as the arguments provided to the call. The old contents of file2 will + * be moved to file1. + * + * Returns -EBUSY if there isn't an exact match for the file2 fields. + * + * Filesystems must be able to restart and complete the operation even after + * the system goes down. + */ +struct xfs_commit_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ + + /* opaque file2 metadata for freshness checks */ + __u64 file2_freshness[6]; +}; + +/* + * Exchange file data all the way to the ends of both files, and then exchange + * the file sizes. This flag can be used to replace a file's contents with a + * different amount of data. length will be ignored. + */ +#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0) + +/* Flush all changes in file data and file metadata to disk before returning. */ +#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1) + +/* Dry run; do all the parameter verification but do not change anything. */ +#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2) + +/* + * Exchange only the parts of the two files where the file allocation units + * mapped to file1's range have been written to. This can accelerate + * scatter-gather atomic writes with a temp file if all writes are aligned to + * the file allocation unit. + */ +#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3) + +#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF | \ + XFS_EXCHANGE_RANGE_DSYNC | \ + XFS_EXCHANGE_RANGE_DRY_RUN | \ + XFS_EXCHANGE_RANGE_FILE1_WRITTEN) + +/* Iterating parent pointers of files. */ + +/* target was the root directory */ +#define XFS_GETPARENTS_OFLAG_ROOT (1U << 0) + +/* Cursor is done iterating pptrs */ +#define XFS_GETPARENTS_OFLAG_DONE (1U << 1) + +#define XFS_GETPARENTS_OFLAGS_ALL (XFS_GETPARENTS_OFLAG_ROOT | \ + XFS_GETPARENTS_OFLAG_DONE) + +#define XFS_GETPARENTS_IFLAGS_ALL (0) + +struct xfs_getparents_rec { + struct xfs_handle gpr_parent; /* Handle to parent */ + __u32 gpr_reclen; /* Length of entire record */ + __u32 gpr_reserved; /* zero */ + char gpr_name[]; /* Null-terminated filename */ +}; + +/* Iterate through this file's directory parent pointers */ +struct xfs_getparents { + /* + * Structure to track progress in iterating the parent pointers. + * Must be initialized to zeroes before the first ioctl call, and + * not touched by callers after that. + */ + struct xfs_attrlist_cursor gp_cursor; + + /* Input flags: XFS_GETPARENTS_IFLAG* */ + __u16 gp_iflags; + + /* Output flags: XFS_GETPARENTS_OFLAG* */ + __u16 gp_oflags; + + /* Size of the gp_buffer in bytes */ + __u32 gp_bufsize; + + /* Must be set to zero */ + __u64 gp_reserved; + + /* Pointer to a buffer in which to place xfs_getparents_rec */ + __u64 gp_buffer; +}; + +static inline struct xfs_getparents_rec * +xfs_getparents_first_rec(struct xfs_getparents *gp) +{ + return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer; +} + +static inline struct xfs_getparents_rec * +xfs_getparents_next_rec(struct xfs_getparents *gp, + struct xfs_getparents_rec *gpr) +{ + void *next = ((char *)gpr + gpr->gpr_reclen); + void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize); + + if (next >= end) + return NULL; + + return (struct xfs_getparents_rec *)next; +} + +/* Iterate through this file handle's directory parent pointers. */ +struct xfs_getparents_by_handle { + /* Handle to file whose parents we want. */ + struct xfs_handle gph_handle; + + struct xfs_getparents gph_request; +}; + +/* + * Output for XFS_IOC_RTGROUP_GEOMETRY + */ +struct xfs_rtgroup_geometry { + __u32 rg_number; /* i/o: rtgroup number */ + __u32 rg_length; /* o: length in blocks */ + __u32 rg_sick; /* o: sick things in ag */ + __u32 rg_checked; /* o: checked metadata in ag */ + __u32 rg_flags; /* i/o: flags for this ag */ + __u32 rg_reserved[27]; /* o: zero */ +}; +#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ +#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ +#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ +#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ +#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ /* * ioctl commands that are used by Linux filesystems @@ -802,6 +1038,10 @@ struct xfs_scrub_metadata { /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) #define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) +#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) +#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) +#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) +#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) /* * ioctl commands that replace IRIX syssgi()'s @@ -837,8 +1077,20 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) +#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) +#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range) +#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c new file mode 100644 index 000000000000..e9d76bcdc820 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_extent_busy.h" +#include "xfs_group.h" + +/* + * Groups can have passive and active references. + * + * For passive references the code freeing a group is responsible for cleaning + * up objects that hold the passive references (e.g. cached buffers). + * Routines manipulating passive references are xfs_group_get, xfs_group_hold + * and xfs_group_put. + * + * Active references are for short term access to the group for walking trees or + * accessing state. If a group is being shrunk or offlined, the lookup will fail + * to find that group and return NULL instead. + * Routines manipulating active references are xfs_group_grab and + * xfs_group_rele. + */ + +struct xfs_group * +xfs_group_get( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_get(xg, _RET_IP_); + ASSERT(atomic_read(&xg->xg_ref) >= 0); + atomic_inc(&xg->xg_ref); + } + rcu_read_unlock(); + return xg; +} + +struct xfs_group * +xfs_group_hold( + struct xfs_group *xg) +{ + ASSERT(atomic_read(&xg->xg_ref) > 0 || + atomic_read(&xg->xg_active_ref) > 0); + + trace_xfs_group_hold(xg, _RET_IP_); + atomic_inc(&xg->xg_ref); + return xg; +} + +void +xfs_group_put( + struct xfs_group *xg) +{ + trace_xfs_group_put(xg, _RET_IP_); + + ASSERT(atomic_read(&xg->xg_ref) > 0); + atomic_dec(&xg->xg_ref); +} + +struct xfs_group * +xfs_group_grab( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_grab(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +/* + * Iterate to the next group. To start the iteration at @start_index, a %NULL + * @xg is passed, else the previous group returned from this function. The + * caller should break out of the loop when this returns %NULL. If the caller + * wants to break out of a loop that did not finish it needs to release the + * active reference to @xg using xfs_group_rele() itself. + */ +struct xfs_group * +xfs_group_next_range( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t start_index, + uint32_t end_index, + enum xfs_group_type type) +{ + uint32_t index = start_index; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + if (index > end_index) + return NULL; + return xfs_group_grab(mp, index, type); +} + +/* + * Find the next group after @xg, or the first group if @xg is NULL. + */ +struct xfs_group * +xfs_group_grab_next_mark( + struct xfs_mount *mp, + struct xfs_group *xg, + xa_mark_t mark, + enum xfs_group_type type) +{ + unsigned long index = 0; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + + rcu_read_lock(); + xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark); + if (xg) { + trace_xfs_group_grab_next_tag(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +void +xfs_group_rele( + struct xfs_group *xg) +{ + trace_xfs_group_rele(xg, _RET_IP_); + atomic_dec(&xg->xg_active_ref); +} + +void +xfs_group_free( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type, + void (*uninit)(struct xfs_group *xg)) +{ + struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index); + + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0); + + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + + if (uninit) + uninit(xg); + + /* drop the mount's active reference */ + xfs_group_rele(xg); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + kfree_rcu_mightsleep(xg); +} + +int +xfs_group_insert( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t index, + enum xfs_group_type type) +{ + int error; + + xg->xg_mount = mp; + xg->xg_gno = index; + xg->xg_type = type; + +#ifdef __KERNEL__ + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + spin_lock_init(&xg->xg_state_lock); + xfs_hooks_init(&xg->xg_rmap_update_hooks); +#endif + xfs_defer_drain_init(&xg->xg_intents_drain); + + /* Active ref owned by mount indicates group is online. */ + atomic_set(&xg->xg_active_ref, 1); + + error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL); + if (error) { + WARN_ON_ONCE(error == -EBUSY); + goto out_drain; + } + + return 0; +out_drain: + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + return error; +} + +struct xfs_group * +xfs_group_get_by_fsb( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type); +} diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h new file mode 100644 index 000000000000..4423932a2313 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + */ +#ifndef __LIBXFS_GROUP_H +#define __LIBXFS_GROUP_H 1 + +struct xfs_group { + struct xfs_mount *xg_mount; + uint32_t xg_gno; + enum xfs_group_type xg_type; + atomic_t xg_ref; /* passive reference count */ + atomic_t xg_active_ref; /* active reference count */ + + /* Precalculated geometry info */ + uint32_t xg_block_count; /* max usable gbno */ + uint32_t xg_min_gbno; /* min usable gbno */ + +#ifdef __KERNEL__ + /* -- kernel only structures below this line -- */ + + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold xg_state_lock before accessing this field. + */ + uint16_t xg_checked; + uint16_t xg_sick; + spinlock_t xg_state_lock; + + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain xg_intents_drain; + + /* + * Hook to feed rmapbt updates to an active online repair. + */ + struct xfs_hooks xg_rmap_update_hooks; +#endif /* __KERNEL__ */ +}; + +struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +struct xfs_group *xfs_group_hold(struct xfs_group *xg); +void xfs_group_put(struct xfs_group *xg); + +struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_next_range(struct xfs_mount *mp, + struct xfs_group *xg, uint32_t start_index, uint32_t end_index, + enum xfs_group_type type); +struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp, + struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type); +void xfs_group_rele(struct xfs_group *xg); + +void xfs_group_free(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type, void (*uninit)(struct xfs_group *xg)); +int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg, + uint32_t index, enum xfs_group_type); + +#define xfs_group_set_mark(_xg, _mark) \ + xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_clear_mark(_xg, _mark) \ + xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_marked(_mp, _type, _mark) \ + xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark)) + +static inline xfs_agblock_t +xfs_group_max_blocks( + struct xfs_group *xg) +{ + return xg->xg_mount->m_groups[xg->xg_type].blocks; +} + +static inline xfs_fsblock_t +xfs_group_start_fsb( + struct xfs_group *xg) +{ + return ((xfs_fsblock_t)xg->xg_gno) << + xg->xg_mount->m_groups[xg->xg_type].blklog; +} + +static inline xfs_fsblock_t +xfs_gbno_to_fsb( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + return xfs_group_start_fsb(xg) | gbno; +} + +static inline xfs_daddr_t +xfs_gbno_to_daddr( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + struct xfs_mount *mp = xg->xg_mount; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; + + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); +} + +static inline uint32_t +xfs_fsb_to_gno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + if (!mp->m_groups[type].blklog) + return 0; + return fsbno >> mp->m_groups[type].blklog; +} + +static inline xfs_agblock_t +xfs_fsb_to_gbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return fsbno & mp->m_groups[type].blkmask; +} + +static inline bool +xfs_verify_gbno( + struct xfs_group *xg, + uint32_t gbno) +{ + if (gbno >= xg->xg_block_count) + return false; + if (gbno < xg->xg_min_gbno) + return false; + return true; +} + +static inline bool +xfs_verify_gbext( + struct xfs_group *xg, + uint32_t gbno, + uint32_t glen) +{ + uint32_t end; + + if (!xfs_verify_gbno(xg, gbno)) + return false; + if (glen == 0 || check_add_overflow(gbno, glen - 1, &end)) + return false; + if (!xfs_verify_gbno(xg, end)) + return false; + return true; +} + +#endif /* __LIBXFS_GROUP_H */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 6296993ff8f3..b31000f7190c 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -6,6 +6,8 @@ #ifndef __XFS_HEALTH_H__ #define __XFS_HEALTH_H__ +struct xfs_group; + /* * In-Core Filesystem Health Assessments * ===================================== @@ -26,25 +28,50 @@ * and the "sick" field tells us if that piece was found to need repairs. * Therefore we can conclude that for a given sick flag value: * - * - checked && sick => metadata needs repair - * - checked && !sick => metadata is ok - * - !checked => has not been examined since mount + * - checked && sick => metadata needs repair + * - checked && !sick => metadata is ok + * - !checked && sick => errors have been observed during normal operation, + * but the metadata has not been checked thoroughly + * - !checked && !sick => has not been examined since mount + * + * Evidence of health problems can be sorted into three basic categories: + * + * a) Primary evidence, which signals that something is defective within the + * general grouping of metadata. + * + * b) Secondary evidence, which are side effects of primary problem but are + * not themselves problems. These can be forgotten when the primary + * health problems are addressed. + * + * c) Indirect evidence, which points to something being wrong in another + * group, but we had to release resources and this is all that's left of + * that state. */ struct xfs_mount; struct xfs_perag; struct xfs_inode; struct xfs_fsop_geom; +struct xfs_btree_cur; +struct xfs_da_args; +struct xfs_rtgroup; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ #define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */ #define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ - -/* Observable health issues for realtime volume metadata. */ -#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ -#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ +#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ +#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ +#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */ +#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */ + +/* Observable health issues for realtime group metadata. */ +#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ +#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ +#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ +#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ +#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ /* Observable health issues for AG metadata. */ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -57,6 +84,7 @@ struct xfs_fsop_geom; #define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */ #define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */ /* Observable health issues for inode metadata. */ #define XFS_SICK_INO_CORE (1 << 0) /* inode core */ @@ -73,14 +101,25 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ #define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ +/* Don't propagate sick status to ag health summary during inactivation */ +#define XFS_SICK_INO_FORGET (1 << 12) +#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */ + /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ XFS_SICK_FS_GQUOTA | \ - XFS_SICK_FS_PQUOTA) - -#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ - XFS_SICK_RT_SUMMARY) + XFS_SICK_FS_PQUOTA | \ + XFS_SICK_FS_QUOTACHECK | \ + XFS_SICK_FS_NLINKS | \ + XFS_SICK_FS_METADIR | \ + XFS_SICK_FS_METAPATH) + +#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ + XFS_SICK_RG_BITMAP | \ + XFS_SICK_RG_SUMMARY | \ + XFS_SICK_RG_RMAPBT | \ + XFS_SICK_RG_REFCNTBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ @@ -100,36 +139,93 @@ struct xfs_fsop_geom; XFS_SICK_INO_DIR | \ XFS_SICK_INO_XATTR | \ XFS_SICK_INO_SYMLINK | \ - XFS_SICK_INO_PARENT) + XFS_SICK_INO_PARENT | \ + XFS_SICK_INO_DIRTREE) #define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ XFS_SICK_INO_BMBTA_ZAPPED | \ XFS_SICK_INO_DIR_ZAPPED | \ XFS_SICK_INO_SYMLINK_ZAPPED) -/* These functions must be provided by the xfs implementation. */ +/* Secondary state related to (but not primary evidence of) health problems. */ +#define XFS_SICK_FS_SECONDARY (0) +#define XFS_SICK_RG_SECONDARY (0) +#define XFS_SICK_AG_SECONDARY (0) +#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) + +/* Evidence of health problems elsewhere. */ +#define XFS_SICK_FS_INDIRECT (0) +#define XFS_SICK_RG_INDIRECT (0) +#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) +#define XFS_SICK_INO_INDIRECT (0) + +/* All health masks. */ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ + XFS_SICK_FS_SECONDARY | \ + XFS_SICK_FS_INDIRECT) + +#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \ + XFS_SICK_RG_SECONDARY | \ + XFS_SICK_RG_INDIRECT) + +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ + XFS_SICK_AG_SECONDARY | \ + XFS_SICK_AG_INDIRECT) + +#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \ + XFS_SICK_INO_SECONDARY | \ + XFS_SICK_INO_INDIRECT | \ + XFS_SICK_INO_ZAPPED) + +/* + * These functions must be provided by the xfs implementation. Function + * behavior with respect to the first argument should be as follows: + * + * xfs_*_mark_sick: Set the sick flags and do not set checked flags. + * Runtime code should call this upon encountering + * a corruption. + * + * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously. + * Fsck tools should call this when corruption is + * found. + * + * xfs_*_mark_healthy: Clear the sick flags and set the checked flags. + * Fsck tools should call this after correcting errors. + * + * xfs_*_measure_sickness: Return the sick and check status in the provided + * out parameters. + */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask); void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); -void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, - unsigned int *checked); - -void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, +void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno, + unsigned int mask); + +void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int mask); +void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask); +#define xfs_ag_mark_sick(pag, mask) \ + xfs_group_mark_sick(pag_group(pag), (mask)) +void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask); +void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask); +void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask); void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask); void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, unsigned int *checked); void xfs_health_unmount(struct xfs_mount *mp); +void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_btree_mark_sick(struct xfs_btree_cur *cur); +void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_da_mark_sick(struct xfs_da_args *args); /* Now some helpers. */ @@ -143,22 +239,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) } static inline bool -xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +xfs_group_has_sickness( + struct xfs_group *xg, + unsigned int mask) { - unsigned int sick, checked; + unsigned int sick, checked; - xfs_rt_measure_sickness(mp, &sick, &checked); + xfs_group_measure_sickness(xg, &sick, &checked); return sick & mask; } -static inline bool -xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) -{ - unsigned int sick, checked; +#define xfs_ag_has_sickness(pag, mask) \ + xfs_group_has_sickness(pag_group(pag), (mask)) +#define xfs_ag_is_healthy(pag) \ + (!xfs_ag_has_sickness((pag), UINT_MAX)) - xfs_ag_measure_sickness(pag, &sick, &checked); - return sick & mask; -} +#define xfs_rtgroup_has_sickness(rtg, mask) \ + xfs_group_has_sickness(rtg_group(rtg), (mask)) +#define xfs_rtgroup_is_healthy(rtg) \ + (!xfs_rtgroup_has_sickness((rtg), UINT_MAX)) static inline bool xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) @@ -176,18 +275,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp) } static inline bool -xfs_rt_is_healthy(struct xfs_mount *mp) -{ - return !xfs_rt_has_sickness(mp, -1U); -} - -static inline bool -xfs_ag_is_healthy(struct xfs_perag *pag) -{ - return !xfs_ag_has_sickness(pag, -1U); -} - -static inline bool xfs_inode_is_healthy(struct xfs_inode *ip) { return !xfs_inode_has_sickness(ip, -1U); @@ -195,6 +282,11 @@ xfs_inode_is_healthy(struct xfs_inode *ip) void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); +#define xfs_metadata_is_sick(error) \ + (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC)) + #endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 2361a22035b0..0c47b5c6ca7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -27,6 +27,7 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. @@ -140,13 +141,13 @@ xfs_inobt_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -169,7 +170,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -205,14 +206,17 @@ xfs_inobt_insert( struct xfs_buf *agbp, xfs_agino_t newino, xfs_agino_t newlen, - xfs_btnum_t btnum) + bool is_finobt) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + if (is_finobt) + cur = xfs_finobt_init_cursor(pag, tp, agbp); + else + cur = xfs_inobt_init_cursor(pag, tp, agbp); for (thisino = newino; thisino < newino + newlen; @@ -271,8 +275,10 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!xfs_is_shutdown(cur->bc_mp)) - ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); + if (!xfs_is_shutdown(cur->bc_mp)) { + ASSERT(freecount == + to_perag(cur->bc_group)->pagi_freecount); + } } return 0; } @@ -358,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -528,16 +534,14 @@ __xfs_inobt_rec_merge( } /* - * Insert a new sparse inode chunk into the associated inode btree. The inode - * record for the sparse chunk is pre-aligned to a startino that should match - * any pre-existing sparse inode record in the tree. This allows sparse chunks - * to fill over time. + * Insert a new sparse inode chunk into the associated inode allocation btree. + * The inode record for the sparse chunk is pre-aligned to a startino that + * should match any pre-existing sparse inode record in the tree. This allows + * sparse chunks to fill over time. * - * This function supports two modes of handling preexisting records depending on - * the merge flag. If merge is true, the provided record is merged with the + * If no preexisting record exists, the provided record is inserted. + * If there is a preexisting record, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. - * If merge is false, an existing record is replaced with the provided record. - * If no preexisting record exists, the provided record is always inserted. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. @@ -547,17 +551,15 @@ xfs_inobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - int btnum, - struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ - bool merge) /* merge or replace */ + struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -571,6 +573,7 @@ xfs_inobt_insert_sprec( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -579,45 +582,42 @@ xfs_inobt_insert_sprec( } /* - * A record exists at this startino. Merge or replace the record - * depending on what we've been asked to do. + * A record exists at this startino. Merge the records. */ - if (merge) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto error; - } - if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { - error = -EFSCORRUPTED; - goto error; - } + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - /* - * This should never fail. If we have coexisting records that - * cannot merge, something is seriously wrong. - */ - if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { - error = -EFSCORRUPTED; - goto error; - } + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(pag, &rec, nrec); - /* merge to nrec to output the updated record */ - __xfs_inobt_rec_merge(nrec, &rec); + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(pag, nrec); - error = xfs_inobt_rec_check_count(mp, nrec); - if (error) - goto error; - } + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; error = xfs_inobt_update(cur, nrec); if (error) @@ -632,6 +632,59 @@ error: } /* + * Insert a new sparse inode chunk into the free inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * The new record is always inserted, overwriting a pre-existing record if + * there is one. + */ +STATIC int +xfs_finobt_insert_sprec( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ +{ + struct xfs_mount *mp = pag_mount(pag); + struct xfs_btree_cur *cur; + int error; + int i; + + cur = xfs_finobt_init_cursor(pag, tp, agbp); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + } else { + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + + +/* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so * the caller knows it can try another AG, a hard -ENOSPC when over the maximum @@ -714,8 +767,7 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - args.agbno)); + xfs_agbno_to_fsb(pag, args.agbno)); if (error) return error; @@ -757,8 +809,8 @@ xfs_ialloc_ag_alloc( */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -770,8 +822,8 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -801,13 +853,14 @@ sparse_alloc: * the end of the AG. */ args.min_agbno = args.mp->m_sb.sb_inoalignmt; - args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.max_agbno = round_down(xfs_ag_block_count(args.mp, + pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -830,7 +883,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) @@ -857,13 +910,11 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_INO, &rec, true); + error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, pag->pag_agno, - rec.ir_startino), + xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } @@ -882,21 +933,19 @@ sparse_alloc: * existing record with this one. */ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_FINO, &rec, false); + error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, - XFS_BTNUM_INO); + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false); if (error) return error; if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(pag, tp, agbp, newino, - newlen, XFS_BTNUM_FINO); + newlen, true); if (error) return error; } @@ -949,8 +998,10 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -974,8 +1025,10 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -1003,6 +1056,33 @@ xfs_inobt_first_free_inode( } /* + * If this AG has corrupt inodes, check if allocating this inode would fail + * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again + * somewhere else. + */ +static int +xfs_dialloc_check_ino( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_ino_t ino) +{ + struct xfs_imap imap; + struct xfs_buf *bp; + int error; + + error = xfs_imap(pag, tp, ino, &imap, 0); + if (error) + return -EAGAIN; + + error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); + if (error) + return -EAGAIN; + + xfs_trans_brelse(tp, bp); + return 0; +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -1030,7 +1110,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1045,7 +1125,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == pag->pag_agno) { + if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1053,6 +1133,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1061,6 +1142,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1219,6 +1301,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1228,6 +1311,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1237,6 +1321,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1248,7 +1333,14 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error0; + } + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1297,8 +1389,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; + } /* * See if we've landed in the parent inode record. The finobt @@ -1322,12 +1416,14 @@ xfs_dialloc_ag_finobt_near( if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } @@ -1383,8 +1479,10 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } } @@ -1395,14 +1493,18 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -1424,14 +1526,18 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); @@ -1440,8 +1546,10 @@ xfs_dialloc_ag_update_inobt( if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || - rec.ir_freecount != frec->ir_freecount)) + rec.ir_freecount != frec->ir_freecount)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return xfs_inobt_update(cur, &rec); } @@ -1483,7 +1591,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1494,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (pag->pag_agno == pagno) + if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1506,7 +1614,13 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error_cur; + } /* * Modify or remove the finobt record. @@ -1526,7 +1640,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(icur); if (error) @@ -1623,7 +1737,7 @@ xfs_dialloc_good_ag( return false; if (!xfs_perag_initialised_agi(pag)) { - error = xfs_ialloc_read_agi(pag, tp, NULL); + error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } @@ -1692,7 +1806,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; @@ -1729,6 +1843,40 @@ out_release: } /* + * Pick an AG for the new inode. + * + * Directories, symlinks, and regular files frequently allocate at least one + * block, so factor that potential expansion when we examine whether an AG has + * enough space for file creation. Try to keep metadata files all in the same + * AG. + */ +static inline xfs_agnumber_t +xfs_dialloc_pick_ag( + struct xfs_mount *mp, + struct xfs_inode *dp, + umode_t mode) +{ + xfs_agnumber_t start_agno; + + if (!dp) + return 0; + if (xfs_is_metadir_inode(dp)) { + if (mp->m_sb.sb_logstart) + return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); + return 0; + } + + if (S_ISDIR(mode)) + return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; + + start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + + return start_agno; +} + +/* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to @@ -1739,34 +1887,23 @@ out_release: int xfs_dialloc( struct xfs_trans **tpp, - xfs_ino_t parent, - umode_t mode, + const struct xfs_icreate_args *args, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; - xfs_agnumber_t agno; - int error = 0; - xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_ino_t ino = NULLFSINO; + xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; + xfs_agnumber_t agno; + xfs_agnumber_t start_agno; + umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; bool low_space = false; int flags; - xfs_ino_t ino = NULLFSINO; + int error = 0; - /* - * Directories, symlinks, and regular files frequently allocate at least - * one block, so factor that potential expansion when we examine whether - * an AG has enough space for file creation. - */ - if (S_ISDIR(mode)) - start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % - mp->m_maxagi; - else { - start_agno = XFS_INO_TO_AGNO(mp, parent); - if (start_agno >= mp->m_maxagi) - start_agno = 0; - } + start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear @@ -1790,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; @@ -1830,6 +1967,21 @@ retry: } return -ENOSPC; } + + /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if (ino == parent || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); + return -EFSCORRUPTED; + } + *new_ino = ino; return 0; } @@ -1842,7 +1994,7 @@ retry: static int xfs_difree_inode_chunk( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; @@ -1856,10 +2008,9 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - return xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, sagbno), + return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1903,10 +2054,9 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, - false); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), + contigblk, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -1928,7 +2078,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -1943,7 +2093,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1958,6 +2108,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1968,6 +2119,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1991,8 +2143,7 @@ xfs_difree_inobt( if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - rec.ir_startino); + xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2015,7 +2166,7 @@ xfs_difree_inobt( goto error0; } - error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag, &rec); if (error) goto error0; } else { @@ -2061,14 +2212,14 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2080,6 +2231,7 @@ xfs_difree_finobt( * something is out of sync. */ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2106,6 +2258,7 @@ xfs_difree_finobt( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2116,6 +2269,7 @@ xfs_difree_finobt( if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2181,31 +2335,31 @@ xfs_difree( /* * Break up inode number into its components. */ - if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { - xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", - __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); + if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + if (inode != xfs_agino_to_ino(pag, agino)) { + xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { + xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", + __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2244,18 +2398,18 @@ xfs_imap_lookup( xfs_agblock_t *offset_agbno, int flags) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, pag->pag_agno); + __func__, error, pag_agno(pag)); return error; } @@ -2265,7 +2419,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2305,7 +2459,7 @@ xfs_imap( struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2321,8 +2475,8 @@ xfs_imap( */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || + ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* @@ -2331,17 +2485,18 @@ xfs_imap( */ if (flags & XFS_IGET_UNTRUSTED) return error; - if (agbno >= mp->m_sb.sb_agblocks) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, - (unsigned long)mp->m_sb.sb_agblocks); + (unsigned long)xfs_ag_block_count(mp, + pag_agno(pag))); } - if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, - "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ @@ -2371,7 +2526,7 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2401,7 +2556,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2594,16 +2749,19 @@ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + xfs_buf_flags_t flags, struct xfs_buf **agibpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) return error; if (tp) @@ -2621,15 +2779,18 @@ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_ialloc_read_agi(pag); - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, + (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, + &agibp); if (error) return error; @@ -2645,7 +2806,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag->pag_mount)); + xfs_is_shutdown(pag_mount(pag))); if (agibpp) *agibpp = agibp; else @@ -2745,7 +2906,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); @@ -2765,7 +2926,7 @@ xfs_ialloc_count_inodes( struct xfs_ialloc_count_inodes ci = {0}; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if (error) return error; @@ -2806,8 +2967,8 @@ xfs_ialloc_setup_geometry( /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true); + igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false); igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; @@ -2892,6 +3053,11 @@ xfs_ialloc_setup_geometry( igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; + + if (mp->m_sb.sb_blocksize > PAGE_SIZE) + igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; + else + igeo->min_folio_order = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ @@ -2979,13 +3145,13 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_has_sparseinodes(pag->pag_mount)) + if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; - cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. */ - agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); + agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; @@ -2995,6 +3161,7 @@ xfs_ialloc_check_shrink( goto out; if (!has) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT); error = -EFSCORRUPTED; goto out; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index f1412183bb44..3a1323155a45 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -33,11 +33,13 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } +struct xfs_icreate_args; + /* * Allocate an inode on disk. Mode is used to tell whether the new inode will * need space, and whether it is a directory. */ -int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, +int xfs_dialloc(struct xfs_trans **tpp, const struct xfs_icreate_args *args, xfs_ino_t *new_ino); int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, @@ -63,10 +65,11 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags, struct xfs_buf **agibpp); int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf **agibpp); + int flags, struct xfs_buf **agibpp); +#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 42a5e1f227a0..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -17,6 +17,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" @@ -36,8 +37,16 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_btnum); + return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, + cur->bc_ag.agbp); +} + +STATIC struct xfs_btree_cur * +xfs_finobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, + cur->bc_ag.agbp); } STATIC void @@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount( if (!xfs_has_inobtcounts(cur->bc_mp)) return; - if (cur->bc_btnum == XFS_BTNUM_FINO) + if (xfs_btree_is_fino(cur->bc_ops)) be32_add_cpu(&agi->agi_fblocks, howmuch); - else if (cur->bc_btnum == XFS_BTNUM_INO) + else be32_add_cpu(&agi->agi_iblocks, howmuch); xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); } @@ -103,7 +112,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_INOBT; args.minlen = 1; args.maxlen = 1; @@ -111,7 +120,7 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); + xfs_agbno_to_fsb(args.pag, sbno)); if (error) return error; @@ -161,7 +170,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv, false); + &XFS_RMAP_OINFO_INOBT, resv, 0); } STATIC int @@ -239,7 +248,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -251,7 +260,8 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; } @@ -300,7 +310,7 @@ xfs_inobt_verify( * xfs_perag_initialised_agi(pag)) if we ever do. */ if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -310,7 +320,7 @@ xfs_inobt_verify( if (level >= M_IGEO(mp)->inobt_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, + return xfs_btree_agblock_verify(bp, M_IGEO(mp)->inobt_mxr[level != 0]); } @@ -320,7 +330,7 @@ xfs_inobt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_inobt_verify(bp); @@ -344,7 +354,7 @@ xfs_inobt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -398,9 +408,17 @@ xfs_inobt_keys_contiguous( be32_to_cpu(key2->inobt.ir_startino)); } -static const struct xfs_btree_ops xfs_inobt_ops = { +const struct xfs_btree_ops xfs_inobt_ops = { + .name = "ino", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2), + .sick_mask = XFS_SICK_AG_INOBT, .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, @@ -420,11 +438,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .keys_contiguous = xfs_inobt_keys_contiguous, }; -static const struct xfs_btree_ops xfs_finobt_ops = { +const struct xfs_btree_ops xfs_finobt_ops = { + .name = "fino", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, - .dup_cursor = xfs_inobt_dup_cursor, + .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), + .sick_mask = XFS_SICK_AG_FINOBT, + + .dup_cursor = xfs_finobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_finobt_free_block, @@ -443,65 +469,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Initialize a new inode btree cursor. + * Create an inode btree cursor. + * + * For staging cursors tp and agbp are NULL. */ -static struct xfs_btree_cur * -xfs_inobt_init_common( +struct xfs_btree_cur * +xfs_inobt_init_cursor( struct xfs_perag *pag, - struct xfs_trans *tp, /* transaction pointer */ - xfs_btnum_t btnum) /* ialloc or free ino btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, btnum, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - if (btnum == XFS_BTNUM_INO) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); - cur->bc_ops = &xfs_inobt_ops; - } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); - cur->bc_ops = &xfs_finobt_ops; - } - - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + cur->bc_group = xfs_group_hold(pag_group(pag)); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + } return cur; } -/* Create an inode btree cursor. */ +/* + * Create a free inode btree cursor. + * + * For staging cursors tp and agbp are NULL. + */ struct xfs_btree_cur * -xfs_inobt_init_cursor( +xfs_finobt_init_cursor( struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_btnum_t btnum) + struct xfs_buf *agbp) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(pag, tp, btnum); - if (btnum == XFS_BTNUM_INO) - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - else - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; - return cur; -} + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; -/* Create an inode btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_inobt_stage_cursor( - struct xfs_perag *pag, - struct xbtree_afakeroot *afake, - xfs_btnum_t btnum) -{ - struct xfs_btree_cur *cur; - - cur = xfs_inobt_init_common(pag, NULL, btnum); - xfs_btree_stage_afakeroot(cur, afake); + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + } return cur; } @@ -521,7 +536,7 @@ xfs_inobt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - if (cur->bc_btnum == XFS_BTNUM_INO) { + if (xfs_btree_is_ino(cur->bc_ops)) { fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); @@ -530,7 +545,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } else { fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); @@ -540,7 +555,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } } @@ -558,11 +573,11 @@ xfs_inobt_block_maxrecs( /* * Calculate number of records in an inobt btree block. */ -int +unsigned int xfs_inobt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { blocklen -= XFS_INOBT_BLOCK_LEN(mp); return xfs_inobt_block_maxrecs(blocklen, leaf); @@ -701,8 +716,8 @@ static xfs_extlen_t xfs_inobt_max_size( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; - xfs_agblock_t agblocks = pag->block_count; + struct xfs_mount *mp = pag_mount(pag); + xfs_agblock_t agblocks = pag_group(pag)->xg_block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -713,7 +728,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -721,48 +736,26 @@ xfs_inobt_max_size( XFS_INODES_PER_CHUNK); } -/* Read AGI and create inobt cursor. */ -int -xfs_inobt_cur( - struct xfs_perag *pag, - struct xfs_trans *tp, - xfs_btnum_t which, - struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp) -{ - struct xfs_btree_cur *cur; - int error; - - ASSERT(*agi_bpp == NULL); - ASSERT(*curpp == NULL); - - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); - if (error) - return error; - - cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which); - *curpp = cur; - return 0; -} - static int -xfs_inobt_count_blocks( +xfs_finobt_count_blocks( struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp = NULL; - struct xfs_btree_cur *cur = NULL; + struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; - error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; - error = xfs_btree_count_blocks(cur, tree_blocks); + cur = xfs_finobt_init_cursor(pag, tp, agbp); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; return error; } @@ -778,7 +771,7 @@ xfs_finobt_read_blocks( struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; @@ -801,14 +794,13 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(pag->pag_mount)) + if (!xfs_has_finobt(pag_mount(pag))) return 0; - if (xfs_has_inobtcounts(pag->pag_mount)) + if (xfs_has_inobtcounts(pag_mount(pag))) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else - error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO, - &tree_len); + error = xfs_finobt_count_blocks(pag, tp, &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 3262c3fe5ebe..300edf5bc009 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -46,11 +46,12 @@ struct xfs_perag; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, - struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, - struct xbtree_afakeroot *afake, xfs_btnum_t btnum); -extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); +struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); +unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); /* ir_holemask to inode allocation bitmap conversion */ uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec); @@ -66,9 +67,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); -int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index f4e6b200cdf8..8796f2b3e534 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -394,11 +394,18 @@ xfs_iext_leaf_key( return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK; } +static inline void * +xfs_iext_alloc_node( + int size) +{ + return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); +} + static void xfs_iext_grow( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE); int i; if (ifp->if_height == 1) { @@ -454,7 +461,7 @@ xfs_iext_split_node( int *nr_entries) { struct xfs_iext_node *node = *nodep; - struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = KEYS_PER_NODE / 2; int nr_keep = nr_move + (KEYS_PER_NODE & 1); int i = 0; @@ -542,7 +549,7 @@ xfs_iext_split_leaf( int *nr_entries) { struct xfs_iext_leaf *leaf = cur->leaf; - struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = RECS_PER_LEAF / 2; int nr_keep = nr_move + (RECS_PER_LEAF & 1); int i; @@ -583,7 +590,7 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec)); ifp->if_height = 1; /* now that we have a node step into it */ @@ -603,7 +610,8 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_data = new; cur->leaf = new; @@ -743,7 +751,7 @@ xfs_iext_remove_node( again: ASSERT(node->ptrs[pos]); ASSERT(node->ptrs[pos] == victim); - kmem_free(victim); + kfree(victim); nr_entries = xfs_iext_node_nr_entries(node, pos) - 1; offset = node->keys[0]; @@ -789,7 +797,7 @@ again: ASSERT(node == ifp->if_data); ifp->if_data = node->ptrs[0]; ifp->if_height--; - kmem_free(node); + kfree(node); } } @@ -863,7 +871,7 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_data); + kfree(ifp->if_data); ifp->if_data = NULL; } @@ -1044,7 +1052,7 @@ xfs_iext_destroy_node( } } - kmem_free(node); + kfree(node); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 137a65bda95d..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -18,6 +18,8 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_dir2.h" +#include "xfs_health.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -132,9 +134,14 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_buf **bpp) { - return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, - &xfs_inode_buf_ops); + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + imap->im_len, 0, bpp, &xfs_inode_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), + XFS_SICK_AG_INODES); + return error; } static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) @@ -203,12 +210,15 @@ xfs_inode_from_disk( * They will also be unconditionally written back to disk as v2 inodes. */ if (unlikely(from->di_version == 1)) { - set_nlink(inode, be16_to_cpu(from->di_onlink)); + /* di_metatype used to be di_onlink */ + set_nlink(inode, be16_to_cpu(from->di_metatype)); ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); + if (xfs_dinode_is_metadir(from)) + ip->i_metatype = be16_to_cpu(from->di_metatype); } i_uid_write(inode, be32_to_cpu(from->di_uid)); @@ -242,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -309,7 +322,10 @@ xfs_inode_to_disk( struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - to->di_onlink = 0; + if (xfs_is_metadir_inode(ip)) + to->di_metatype = cpu_to_be16(ip->i_metatype); + else + to->di_metatype = 0; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); @@ -336,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -368,17 +385,40 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. Directories + * being truncated have zero size and are not subject to this + * check. + */ + if (S_ISDIR(mode)) { + if (dip->di_size && + be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; @@ -405,6 +445,30 @@ xfs_dinode_verify_fork( if (di_nextents > max_extents) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp)) + return __this_address; + if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) + return __this_address; + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use + * the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(mp)) + return __this_address; + break; + case XFS_METAFILE_RTREFCOUNT: + /* same comment about growfs and rmap inodes applies */ + if (!xfs_has_reflink(mp)) + return __this_address; + break; + default: + return __this_address; + } + break; default: return __this_address; } @@ -424,6 +488,10 @@ xfs_dinode_verify_forkoff( if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) + return __this_address; + fallthrough; case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: @@ -454,6 +522,69 @@ xfs_dinode_verify_nrext64( return NULL; } +/* + * Validate all the picky requirements we have for a file that claims to be + * filesystem metadata. + */ +xfs_failaddr_t +xfs_dinode_verify_metadir( + struct xfs_mount *mp, + struct xfs_dinode *dip, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + if (!xfs_has_metadir(mp)) + return __this_address; + + /* V5 filesystem only */ + if (dip->di_version < 3) + return __this_address; + + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + return __this_address; + + /* V3 inode fields that are always zero */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) + return __this_address; + if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) + return __this_address; + + /* Metadata files can only be directories or regular files */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* They must have zero access permissions */ + if (mode & 0777) + return __this_address; + + /* DMAPI event and state masks are zero */ + if (dip->di_dmevmask || dip->di_dmstate) + return __this_address; + + /* + * User and group IDs must be zero. The project ID is used for + * grouping inodes. Metadata inodes are never accounted to quotas. + */ + if (dip->di_uid || dip->di_gid) + return __this_address; + + /* Mandatory inode flags must be set */ + if (S_ISDIR(mode)) { + if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) + return __this_address; + } else { + if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) + return __this_address; + } + + /* dax flags2 must not be set */ + if (flags2 & XFS_DIFLAG2_DAX) + return __this_address; + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -485,6 +616,23 @@ xfs_dinode_verify( return __this_address; } + /* + * Historical note: xfsprogs in the 3.2 era set up its incore inodes to + * have di_nlink track the link count, even if the actual filesystem + * only supported V1 inodes (i.e. di_onlink). When writing out the + * ondisk inode, it would set both the ondisk di_nlink and di_onlink to + * the the incore di_nlink value, which is why we cannot check for + * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with + * di_onlink==0, so we can check that. + */ + if (dip->di_version == 2) { + if (dip->di_metatype) + return __this_address; + } else if (dip->di_version >= 3) { + if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) + return __this_address; + } + /* don't allow invalid i_size */ di_size = be64_to_cpu(dip->di_size); if (di_size & (1ULL << 63)) @@ -494,9 +642,20 @@ xfs_dinode_verify( if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; - /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) - return __this_address; + /* + * No zero-length symlinks/dirs unless they're unlinked and hence being + * inactivated. + */ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { + if (dip->di_version > 1) { + if (dip->di_nlink) + return __this_address; + } else { + /* di_metatype used to be di_onlink */ + if (dip->di_metatype) + return __this_address; + } + } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) @@ -510,9 +669,6 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; - if (nextents + naextents == 0 && nblocks != 0) - return __this_address; - if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; @@ -596,20 +752,40 @@ xfs_dinode_verify( return __this_address; /* don't let reflink and realtime mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && + !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_METADATA) { + fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); + if (fa) + return fa; + } + + /* metadata inodes containing btrees always have zero extent count */ + if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + } + return NULL; } @@ -745,11 +921,29 @@ xfs_inode_validate_cowextsize( bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; + uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + /* + * Similar to extent size hints, a directory can be configured to + * propagate realtime status and a CoW extent size hint to newly + * created files even if there is no realtime device, and the hints on + * disk can become misaligned if the sysadmin changes the rt extent + * size while adding the realtime device. + * + * Therefore, we can only enforce the rextsize alignment check against + * regular realtime files, and rely on callers to decide when alignment + * checks are appropriate, and fix things up as needed. + */ + + if (rt_flag) + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + else + blocksize_bytes = mp->m_sb.sb_blocksize; + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -763,16 +957,13 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (hint_flag && rt_flag) - return __this_address; - - if (cowextsize_bytes % mp->m_sb.sb_blocksize) + if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; - if (cowextsize > mp->m_sb.sb_agblocks / 2) + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 585ed5a110af..8d43d2641c73 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp, + struct xfs_dinode *dip, uint16_t mode, uint16_t flags, + uint64_t flags2); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, uint32_t extsize, uint16_t mode, uint16_t flags); xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index f4569e18a8d0..4f99b90add55 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -25,6 +25,10 @@ #include "xfs_attr_leaf.h" #include "xfs_types.h" #include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -50,7 +54,8 @@ xfs_init_local_fork( mem_size++; if (size) { - char *new_data = kmem_alloc(mem_size, KM_NOFS); + char *new_data = kmalloc(mem_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); memcpy(new_data, data, size); if (zero_terminate) @@ -77,7 +82,7 @@ xfs_iformat_local( /* * If the size is unreasonable, then something * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * kmalloc() or memcpy() below. */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, @@ -87,6 +92,7 @@ xfs_iformat_local( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_local", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -116,7 +122,7 @@ xfs_iformat_extents( /* * If the number of extents is unreasonable, then something is wrong and - * we just bail out rather than crash in kmem_alloc() or memcpy() below. + * we just bail out rather than crash in kmalloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", @@ -124,6 +130,7 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -143,6 +150,7 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -172,14 +180,14 @@ xfs_iformat_btree( struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; struct xfs_ifork *ifp; - /* REFERENCED */ + struct xfs_btree_block *broot; int nrecs; int size; int level; ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(mp, dfp); + size = xfs_bmap_broot_space(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); @@ -192,7 +200,7 @@ xfs_iformat_btree( */ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || - XFS_BMDR_SPACE_CALC(nrecs) > + xfs_bmdr_space_calc(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { @@ -201,18 +209,17 @@ xfs_iformat_btree( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } - ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_NOFS); - ASSERT(ifp->if_broot != NULL); + broot = xfs_broot_alloc(ifp, size); /* * Copy and convert from the on-disk structure * to the in-memory structure. */ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); + broot, size); ifp->if_bytes = 0; ifp->if_data = NULL; @@ -262,15 +269,27 @@ xfs_iformat_data_fork( return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); case XFS_DINODE_FMT_BTREE: return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_META_BTREE: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + return xfs_iformat_rtrmap(ip, dip); + case XFS_METAFILE_RTREFCOUNT: + return xfs_iformat_rtrefcount(ip, dip); + default: + break; + } + fallthrough; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } break; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } } @@ -342,6 +361,7 @@ xfs_iformat_attr_fork( default: xfs_inode_verifier_error(ip, error, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); error = -EFSCORRUPTED; break; } @@ -352,134 +372,68 @@ xfs_iformat_attr_fork( } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we are adding records, one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. + * Allocate the if_broot component of an inode fork so that it is @new_size + * bytes in size, using __GFP_NOLOCKDEP like all the other code that + * initializes a broot during inode load. Returns if_broot. */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) +struct xfs_btree_block * +xfs_broot_alloc( + struct xfs_ifork *ifp, + size_t new_size) { - struct xfs_mount *mp = ip->i_mount; - int cur_max; - struct xfs_ifork *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; + ASSERT(ifp->if_broot == NULL); - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } - - ifp = xfs_ifork_ptr(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_NOFS); - ifp->if_broot_bytes = (int)new_size; - return; - } + ifp->if_broot = kmalloc(new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; +} - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_NOFS | __GFP_NOFAIL); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); - return; +/* + * Reallocate the if_broot component of an inode fork so that it is @new_size + * bytes in size. Returns if_broot. + */ +struct xfs_btree_block * +xfs_broot_realloc( + struct xfs_ifork *ifp, + size_t new_size) +{ + /* No size change? No action needed. */ + if (new_size == ifp->if_broot_bytes) + return ifp->if_broot; + + /* New size is zero, free it. */ + if (new_size == 0) { + ifp->if_broot_bytes = 0; + kfree(ifp->if_broot); + ifp->if_broot = NULL; + return NULL; } /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. + * Shrinking the iroot means we allocate a new smaller object and copy + * it. We don't trust krealloc not to nop on realloc-down. */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_NOFS); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - XFS_BMBT_BLOCK_LEN(ip->i_mount)); - } else { - new_broot = NULL; + if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) { + struct xfs_btree_block *old_broot = ifp->if_broot; + + ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + memcpy(ifp->if_broot, old_broot, new_size); + kfree(old_broot); + return ifp->if_broot; } /* - * Only copy the records and pointers if there are any. + * Growing the iroot means we can krealloc. This may get us the same + * object. */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); - - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - } - kmem_free(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - if (ifp->if_broot) - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - return; + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; } - /* * This is called when the amount of space needed for if_data * is increased or decreased. The change in size is indicated by @@ -488,7 +442,7 @@ xfs_iroot_realloc( * * If the amount of space needed has decreased below the size of the * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * use krealloc() or kmalloc() to adjust the size of the buffer * to what is needed. * * ip -- the inode whose if_data area is changing @@ -509,7 +463,7 @@ xfs_idata_realloc( if (byte_diff) { ifp->if_data = krealloc(ifp->if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); if (new_size == 0) ifp->if_data = NULL; ifp->if_bytes = new_size; @@ -524,13 +478,13 @@ xfs_idestroy_fork( struct xfs_ifork *ifp) { if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); + kfree(ifp->if_broot); ifp->if_broot = NULL; } switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_data); + kfree(ifp->if_data); ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: @@ -562,7 +516,7 @@ xfs_iextents_copy( struct xfs_bmbt_irec rec; int64_t copied = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ASSERT(ifp->if_bytes > 0); for_each_xfs_iext(ifp, &icur, &rec) { @@ -643,7 +597,7 @@ xfs_iflush_fork( if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, @@ -659,6 +613,25 @@ xfs_iflush_fork( } break; + case XFS_DINODE_FMT_META_BTREE: + ASSERT(whichfork == XFS_DATA_FORK); + + if (!(iip->ili_fields & brootflag[whichfork])) + break; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + xfs_iflush_rtrmap(ip, dip); + break; + case XFS_METAFILE_RTREFCOUNT: + xfs_iflush_rtrefcount(ip, dip); + break; + default: + ASSERT(0); + break; + } + break; + default: ASSERT(0); break; @@ -689,7 +662,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } @@ -753,52 +726,54 @@ xfs_ifork_verify_local_attr( return 0; } +/* + * Check if the inode fork supports adding nr_to_add more extents. + * + * If it doesn't but we can upgrade it to large extent counters, do the upgrade. + * If we can't upgrade or are already using big counters but still can't fit the + * additional extents, return -EFBIG. + */ int -xfs_iext_count_may_overflow( +xfs_iext_count_extend( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - int nr_to_add) + uint nr_to_add) { + struct xfs_mount *mp = ip->i_mount; + bool has_large = + xfs_inode_has_large_extent_counts(ip); struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - uint64_t max_exts; uint64_t nr_exts; + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + if (whichfork == XFS_COW_FORK) return 0; - max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), - whichfork); - - if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) - max_exts = 10; - + /* no point in upgrading if if_nextents overflows */ nr_exts = ifp->if_nextents + nr_to_add; - if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + if (nr_exts < ifp->if_nextents) + return -EFBIG; + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + nr_exts > 10) return -EFBIG; + if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { + if (has_large || !xfs_has_large_extent_counts(mp)) + return -EFBIG; + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } return 0; } -/* - * Upgrade this inode's extent counter fields to be able to handle a potential - * increase in the extent count by nr_to_add. Normally this is the same - * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. - */ -int -xfs_iext_count_upgrade( - struct xfs_trans *tp, +/* Decide if a file mapping is on the realtime device or not. */ +bool +xfs_ifork_is_realtime( struct xfs_inode *ip, - uint nr_to_add) + int whichfork) { - ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); - - if (!xfs_has_large_extent_counts(ip->i_mount) || - xfs_inode_has_large_extent_counts(ip) || - XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) - return -EFBIG; - - ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - return 0; + return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96303249d28a..69ed0919d60b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -170,7 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, void xfs_idestroy_fork(struct xfs_ifork *ifp); void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); -void xfs_iroot_realloc(struct xfs_inode *, int, int); +struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, + size_t new_size); +struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, + size_t new_size); + int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); @@ -256,10 +260,9 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip); int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); -int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, - int nr_to_add); -int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, - uint nr_to_add); +int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, uint nr_to_add); +bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork); /* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c new file mode 100644 index 000000000000..48fe49a5f050 --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include <linux/iversion.h> +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_inode_util.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" +#include "xfs_inode_item.h" + +uint16_t +xfs_flags2diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + /* can't set PREALLOC this way, just preserve it */ + uint16_t di_flags = + (ip->i_diflags & XFS_DIFLAG_PREALLOC); + + if (xflags & FS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & FS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & FS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & FS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & FS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & FS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & FS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & FS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & FS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & FS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + return di_flags; +} + +uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); + + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + + return di_flags2; +} + +uint32_t +xfs_ip2xflags( + struct xfs_inode *ip) +{ + uint32_t flags = 0; + + if (ip->i_diflags & XFS_DIFLAG_ANY) { + if (ip->i_diflags & XFS_DIFLAG_REALTIME) + flags |= FS_XFLAG_REALTIME; + if (ip->i_diflags & XFS_DIFLAG_PREALLOC) + flags |= FS_XFLAG_PREALLOC; + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_XFLAG_IMMUTABLE; + if (ip->i_diflags & XFS_DIFLAG_APPEND) + flags |= FS_XFLAG_APPEND; + if (ip->i_diflags & XFS_DIFLAG_SYNC) + flags |= FS_XFLAG_SYNC; + if (ip->i_diflags & XFS_DIFLAG_NOATIME) + flags |= FS_XFLAG_NOATIME; + if (ip->i_diflags & XFS_DIFLAG_NODUMP) + flags |= FS_XFLAG_NODUMP; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) + flags |= FS_XFLAG_RTINHERIT; + if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) + flags |= FS_XFLAG_PROJINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) + flags |= FS_XFLAG_NOSYMLINKS; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + flags |= FS_XFLAG_EXTSIZE; + if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) + flags |= FS_XFLAG_EXTSZINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) + flags |= FS_XFLAG_NODEFRAG; + if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) + flags |= FS_XFLAG_FILESTREAM; + } + + if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; + } + + if (xfs_inode_has_attr_fork(ip)) + flags |= FS_XFLAG_HASATTR; + return flags; +} + +prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) + return dp->i_projid; + + /* Assign to the root project by default. */ + return 0; +} + +/* Propagate di_flags from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + xfs_failaddr_t failaddr; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_extsize = pip->i_extsize; + } + if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && + xfs_has_realtime(ip->i_mount)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_extsize = pip->i_extsize; + } + } + if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_diflags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_diflags |= di_flags; + + /* + * Inode verifiers on older kernels only check that the extent size + * hint is an integer multiple of the rt extent size on realtime files. + * They did not check the hint alignment on a directory with both + * rtinherit and extszinherit flags set. If the misaligned hint is + * propagated from a directory into a new realtime file, new file + * allocations will fail due to math errors in the rt allocator and/or + * trip the verifiers. Validate the hint settings in the new file so + * that we don't let broken hints propagate. + */ + failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, + VFS_I(ip)->i_mode, ip->i_diflags); + if (failaddr) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + } +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + xfs_failaddr_t failaddr; + + if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = pip->i_cowextsize; + } + if (pip->i_diflags2 & XFS_DIFLAG2_DAX) + ip->i_diflags2 |= XFS_DIFLAG2_DAX; + if (xfs_is_metadir_inode(pip)) + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; + + /* Don't let invalid cowextsize hints propagate. */ + failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, + VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); + if (failaddr) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + } +} + +/* + * If we need to create attributes immediately after allocating the inode, + * initialise an empty attribute fork right now. We use the default fork offset + * for attributes here as we don't know exactly what size or how many + * attributes we might be adding. We can do this safely here because we know + * the data fork is completely empty and this saves us from needing to run a + * separate transaction to set the fork offset in the immediate future. + * + * If we have parent pointers and the caller hasn't told us that the file will + * never be linked into a directory tree, we /must/ create the attr fork. + */ +static inline bool +xfs_icreate_want_attrfork( + struct xfs_mount *mp, + const struct xfs_icreate_args *args) +{ + if (args->flags & XFS_ICREATE_INIT_XATTRS) + return true; + + if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp)) + return true; + + return false; +} + +/* Initialise an inode's attributes. */ +void +xfs_inode_init( + struct xfs_trans *tp, + const struct xfs_icreate_args *args, + struct xfs_inode *ip) +{ + struct xfs_inode *pip = args->pip; + struct inode *dir = pip ? VFS_I(pip) : NULL; + struct xfs_mount *mp = tp->t_mountp; + struct inode *inode = VFS_I(ip); + unsigned int flags; + int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | + XFS_ICHGTIME_ACCESS; + + if (args->flags & XFS_ICREATE_TMPFILE) + set_nlink(inode, 0); + else if (S_ISDIR(args->mode)) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + inode->i_rdev = args->rdev; + + if (!args->idmap || pip == NULL) { + /* creating a tree root, sb rooted, or detached file */ + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + ip->i_projid = 0; + inode->i_mode = args->mode; + } else { + /* creating a child in the directory tree */ + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { + inode_fsuid_set(inode, args->idmap); + inode->i_gid = dir->i_gid; + inode->i_mode = args->mode; + } else { + inode_init_owner(args->idmap, inode, dir, args->mode); + } + + /* + * If the group ID of the new file does not match the effective + * group ID or one of the supplementary group IDs, the S_ISGID + * bit is cleared (and only if the irix_sgid_inherit + * compatibility variable is set). + */ + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) + inode->i_mode &= ~S_ISGID; + + ip->i_projid = xfs_get_initial_prid(pip); + } + + ip->i_disk_size = 0; + ip->i_df.if_nextents = 0; + ASSERT(ip->i_nblocks == 0); + + ip->i_extsize = 0; + ip->i_diflags = 0; + + if (xfs_has_v3inodes(mp)) { + inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ + ip->i_cowextsize = 0; + times |= XFS_ICHGTIME_CREATE; + } + + xfs_trans_ichgtime(tp, ip, times); + + flags = XFS_ILOG_CORE; + switch (args->mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_df.if_format = XFS_DINODE_FMT_DEV; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); + fallthrough; + case S_IFLNK: + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_bytes = 0; + ip->i_df.if_data = NULL; + break; + default: + ASSERT(0); + } + + if (xfs_icreate_want_attrfork(mp, args)) { + ip->i_forkoff = xfs_default_attroffset(ip) >> 3; + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); + + if (!xfs_has_attr(mp)) { + spin_lock(&mp->m_sb_lock); + xfs_add_attr(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } + } + + xfs_trans_log_inode(tp, ip, flags); +} + +/* + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. + * + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. + */ + +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ +static int +xfs_iunlink_update_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_inode *ip; + + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) + return 0; + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -ENOLINK; + + ip->i_prev_unlinked = prev_agino; + return 0; +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value, + new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +static int +xfs_iunlink_insert_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t next_agino; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(pag, next_agino)) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Update the prev pointer in the next inode to point back to this + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); + if (error) + return error; + + if (next_agino != NULLAGINO) { + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); + if (error) + return error; + ip->i_next_unlinked = next_agino; + } + + /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. + */ +int +xfs_iunlink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + struct xfs_buf *agibp; + int error; + + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + goto out; + + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); +out: + xfs_perag_put(pag); + return error; +} + +static int +xfs_iunlink_remove_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + trace_xfs_iunlink_remove(ip); + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. + */ + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(pag, head_agino)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + agi, sizeof(*agi)); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); + if (error) + return error; + + /* + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. + */ + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; + + if (head_agino != agino) { + struct xfs_inode *prev_ip; + + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_log_inode(tp, prev_ip, pag, + ip->i_next_unlinked); + prev_ip->i_next_unlinked = ip->i_next_unlinked; + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); + } + + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; + return error; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); +} + +/* + * Decrement the link count on an inode & log the change. If this causes the + * link count to go to zero, move the inode to AGI unlinked list so that it can + * be freed when the last active reference goes away via xfs_inactive(). + */ +int +xfs_droplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == 0) { + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count dropped below zero. Pinning link count.", + ip->i_ino); + set_nlink(inode, XFS_NLINK_PINNED); + } + if (inode->i_nlink != XFS_NLINK_PINNED) + drop_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (inode->i_nlink) + return 0; + + return xfs_iunlink(tp, ip); +} + +/* + * Increment the link count on an inode & log the change. + */ +void +xfs_bumplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == XFS_NLINK_PINNED - 1) + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count exceeded maximum. Pinning link count.", + ip->i_ino); + if (inode->i_nlink != XFS_NLINK_PINNED) + inc_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Free an inode in the ondisk index and zero it out. */ +int +xfs_inode_uninit( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip, + struct xfs_icluster *xic) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. + */ + error = xfs_difree(tp, pag, ip->i_ino, xic); + if (error) + return error; + + error = xfs_iunlink_remove(tp, pag, ip); + if (error) + return error; + + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kfree(ip->i_df.if_data); + ip->i_df.if_data = NULL; + ip->i_df.if_bytes = 0; + } + + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ + ip->i_diflags = 0; + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; + ip->i_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + VFS_I(ip)->i_generation++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h new file mode 100644 index 000000000000..060242998a23 --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_INODE_UTIL_H__ +#define __XFS_INODE_UTIL_H__ + +struct xfs_icluster; + +uint16_t xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags); +uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags); +uint32_t xfs_dic2xflags(struct xfs_inode *ip); +uint32_t xfs_ip2xflags(struct xfs_inode *ip); + +prid_t xfs_get_initial_prid(struct xfs_inode *dp); + +/* + * File creation context. + * + * Due to our only partial reliance on the VFS to propagate uid and gid values + * according to accepted Unix behaviors, callers must initialize idmap to the + * correct idmapping structure to get the correct inheritance behaviors when + * XFS_MOUNT_GRPID is set. + * + * To create files detached from the directory tree (e.g. quota inodes), set + * idmap to NULL. To create a tree root, set pip to NULL. + */ +struct xfs_icreate_args { + struct mnt_idmap *idmap; + struct xfs_inode *pip; /* parent inode or null */ + dev_t rdev; + umode_t mode; + +#define XFS_ICREATE_TMPFILE (1U << 0) /* create an unlinked file */ +#define XFS_ICREATE_INIT_XATTRS (1U << 1) /* will set xattrs immediately */ +#define XFS_ICREATE_UNLINKABLE (1U << 2) /* cannot link into dir tree */ + uint16_t flags; +}; + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ +#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */ +void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags); + +void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, + struct xfs_inode *ip); + +int xfs_inode_uninit(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip, struct xfs_icluster *xic); + +int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip); +int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); + +#endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 269573c82808..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -115,10 +115,13 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_BUD_FORMAT 26 #define XLOG_REG_TYPE_ATTRI_FORMAT 27 #define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_NAME 29 #define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_MAX 30 - +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 /* * Flags to log operation header @@ -243,6 +246,14 @@ typedef struct xfs_trans_header { #define XFS_LI_BUD 0x1245 #define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ #define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ +#define XFS_LI_XMI 0x1248 /* mapping exchange intent */ +#define XFS_LI_XMD 0x1249 /* mapping exchange done */ +#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ +#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ +#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ +#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ +#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ +#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -260,7 +271,15 @@ typedef struct xfs_trans_header { { XFS_LI_BUI, "XFS_LI_BUI" }, \ { XFS_LI_BUD, "XFS_LI_BUD" }, \ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ - { XFS_LI_ATTRD, "XFS_LI_ATTRD" } + { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \ + { XFS_LI_XMI, "XFS_LI_XMI" }, \ + { XFS_LI_XMD, "XFS_LI_XMD" }, \ + { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ + { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } /* * Inode Log Item Format definitions. @@ -340,12 +359,6 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_IVERSION 0x8000 -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ - XFS_ILOG_AOWNER) - #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -397,7 +410,7 @@ struct xfs_log_dinode { uint16_t di_mode; /* mode and type of file */ int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ - uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ uint32_t di_nlink; /* number of links to file */ @@ -462,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ @@ -838,10 +856,12 @@ struct xfs_cud_log_format { #define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) #define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) +#define XFS_BMAP_EXTENT_REALTIME (1U << 29) #define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ XFS_BMAP_EXTENT_ATTR_FORK | \ - XFS_BMAP_EXTENT_UNWRITTEN) + XFS_BMAP_EXTENT_UNWRITTEN | \ + XFS_BMAP_EXTENT_REALTIME) /* * This is the structure used to lay out an bui log item in the @@ -877,6 +897,61 @@ struct xfs_bud_log_format { }; /* + * XMI/XMD (file mapping exchange) log format definitions + */ + +/* This is the structure used to lay out an mapping exchange log item. */ +struct xfs_xmi_log_format { + uint16_t xmi_type; /* xmi log item type */ + uint16_t xmi_size; /* size of this item */ + uint32_t __pad; /* must be zero */ + uint64_t xmi_id; /* xmi identifier */ + + uint64_t xmi_inode1; /* inumber of first file */ + uint64_t xmi_inode2; /* inumber of second file */ + uint32_t xmi_igen1; /* generation of first file */ + uint32_t xmi_igen2; /* generation of second file */ + uint64_t xmi_startoff1; /* block offset into file1 */ + uint64_t xmi_startoff2; /* block offset into file2 */ + uint64_t xmi_blockcount; /* number of blocks */ + uint64_t xmi_flags; /* XFS_EXCHMAPS_* */ + uint64_t xmi_isize1; /* intended file1 size */ + uint64_t xmi_isize2; /* intended file2 size */ +}; + +/* Exchange mappings between extended attribute forks instead of data forks. */ +#define XFS_EXCHMAPS_ATTR_FORK (1ULL << 0) + +/* Set the file sizes when finished. */ +#define XFS_EXCHMAPS_SET_SIZES (1ULL << 1) + +/* + * Exchange the mappings of the two files only if the file allocation units + * mapped to file1's range have been written. + */ +#define XFS_EXCHMAPS_INO1_WRITTEN (1ULL << 2) + +/* Clear the reflink flag from inode1 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK (1ULL << 3) + +/* Clear the reflink flag from inode2 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK (1ULL << 4) + +#define XFS_EXCHMAPS_LOGGED_FLAGS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN | \ + XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \ + XFS_EXCHMAPS_CLEAR_INO2_REFLINK) + +/* This is the structure used to lay out an mapping exchange done log item. */ +struct xfs_xmd_log_format { + uint16_t xmd_type; /* xmd log item type */ + uint16_t xmd_size; /* size of this item */ + uint32_t __pad; + uint64_t xmd_xmi_id; /* id of corresponding xmi */ +}; + +/* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into @@ -964,6 +1039,9 @@ struct xfs_icreate_log { #define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ #define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ #define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_PPTR_SET 4 /* Set parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE 5 /* Remove parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE 6 /* Replace parent pointer */ #define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ /* @@ -972,6 +1050,7 @@ struct xfs_icreate_log { */ #define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT | \ XFS_ATTR_INCOMPLETE) /* @@ -981,11 +1060,22 @@ struct xfs_icreate_log { struct xfs_attri_log_format { uint16_t alfi_type; /* attri log item type */ uint16_t alfi_size; /* size of this item */ - uint32_t __pad; /* pad to 64 bit aligned */ + uint32_t alfi_igen; /* generation of alfi_ino for pptr ops */ uint64_t alfi_id; /* attri identifier */ uint64_t alfi_ino; /* the inode for this attr operation */ uint32_t alfi_op_flags; /* marks the op as a set or remove */ - uint32_t alfi_name_len; /* attr name length */ + union { + uint32_t alfi_name_len; /* attr name length */ + struct { + /* + * For PPTR_REPLACE, these are the lengths of the old + * and new attr names. The new and old values must + * have the same length. + */ + uint16_t alfi_old_name_len; + uint16_t alfi_new_name_len; + }; + }; uint32_t alfi_value_len; /* attr value length */ uint32_t alfi_attr_filter;/* attr filter flags */ }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 9fe7a9564bca..66c7916fb5cd 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -75,6 +75,14 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; extern const struct xlog_recover_item_ops xlog_attri_item_ops; extern const struct xlog_recover_item_ops xlog_attrd_item_ops; +extern const struct xlog_recover_item_ops xlog_xmi_item_ops; +extern const struct xlog_recover_item_ops xlog_xmd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -121,6 +129,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); +int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen, + struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); int xlog_alloc_buf_cancel_table(struct xlog *log); diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 9975b93a7412..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -17,6 +17,34 @@ #include "xfs_trace.h" /* + * Shortly after enabling the large extents count feature in 2023, longstanding + * bugs were found in the code that computes the minimum log size. Luckily, + * the bugs resulted in over-estimates of that size, so there's no impact to + * existing users. However, we don't want to reduce the minimum log size + * because that can create the situation where a newer mkfs writes a new + * filesystem that an older kernel won't mount. + * + * Several years prior, we also discovered that the transaction reservations + * for rmap and reflink operations were unnecessarily large. That was fixed, + * but the minimum log size computation was left alone to avoid the + * compatibility problems noted above. Fix that too. + * + * Therefore, we only may correct the computation starting with filesystem + * features that didn't exist in 2023. In other words, only turn this on if + * the filesystem has parent pointers. + * + * This function can be called before the XFS_HAS_* flags have been set up, + * (e.g. mkfs) so we must check the ondisk superblock. + */ +static inline bool +xfs_want_minlogsize_fixes( + struct xfs_sb *sb) +{ + return xfs_sb_is_v5(sb) && + xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT); +} + +/* * Calculate the maximum length in bytes that would be required for a local * attribute value as large attributes out of line are not logged. */ @@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res( MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); + + /* + * If the feature set is new enough, correct a unit conversion error in + * the xattr transaction reservation code that resulted in oversized + * minimum log size computations. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) + size = XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + @@ -49,6 +86,16 @@ xfs_log_calc_trans_resv_for_minlogblocks( unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; /* + * If the feature set is new enough, drop the oversized minimum log + * size computation introduced by the original reflink code. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) { + xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + return; + } + + /* * In the early days of rmap+reflink, we always set the rmap maxlevels * to 9 even if the AG was small enough that it would never grow to * that height. Transaction reservation sizes influence the minimum @@ -61,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c new file mode 100644 index 000000000000..178e89711cb7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_bmap_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_trans_space.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_parent.h" +#include "xfs_health.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" + +/* + * Metadata Directory Tree + * ======================= + * + * These functions provide an abstraction layer for looking up, creating, and + * deleting metadata inodes that live within a special metadata directory tree. + * + * This code does not manage the five existing metadata inodes: real time + * bitmap & summary; and the user, group, and quotas. All other metadata + * inodes must use only the xfs_meta{dir,file}_* functions. + * + * Callers wishing to create or hardlink a metadata inode must create an + * xfs_metadir_update structure, call the appropriate xfs_metadir* function, + * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel + * the update. Files in the metadata directory tree currently cannot be + * unlinked. + * + * When the metadir feature is enabled, all metadata inodes must have the + * "metadata" inode flag set to prevent them from being exposed to the outside + * world. + * + * Callers must take the ILOCK of any inode in the metadata directory tree to + * synchronize access to that inode. It is never necessary to take the IOLOCK + * or the MMAPLOCK since metadata inodes must not be exposed to user space. + */ + +static inline void +xfs_metadir_set_xname( + struct xfs_name *xname, + const char *path, + unsigned char ftype) +{ + xname->name = (const unsigned char *)path; + xname->len = strlen(path); + xname->type = ftype; +} + +/* + * Given a parent directory @dp and a metadata inode path component @xname, + * Look up the inode number in the directory, returning it in @ino. + * @xname.type must match the directory entry's ftype. + * + * Caller must hold ILOCK_EXCL. + */ +static inline int +xfs_metadir_lookup( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_name *xname, + xfs_ino_t *ino) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args = { + .trans = tp, + .dp = dp, + .geo = mp->m_dir_geo, + .name = xname->name, + .namelen = xname->len, + .hashval = xfs_dir2_hashname(mp, xname), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, + }; + int error; + + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_dir_lookup_args(&args); + if (error) + return error; + + if (!xfs_verify_ino(mp, args.inumber)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + trace_xfs_metadir_lookup(dp, xname, args.inumber); + *ino = args.inumber; + return 0; +} + +/* + * Look up and read a metadata inode from the metadata directory. If the path + * component doesn't exist, return -ENOENT. + */ +int +xfs_metadir_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + const char *path, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_name xname; + xfs_ino_t ino; + int error; + + xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN); + + xfs_ilock(dp, XFS_ILOCK_EXCL); + error = xfs_metadir_lookup(tp, dp, &xname, &ino); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (error) + return error; + return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); +} + +/* + * Unlock and release resources after committing (or cancelling) a metadata + * directory tree operation. The caller retains its reference to @upd->ip + * and must release it explicitly. + */ +static inline void +xfs_metadir_teardown( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_teardown(upd, error); + + if (upd->ppargs) { + xfs_parent_finish(upd->dp->i_mount, upd->ppargs); + upd->ppargs = NULL; + } + + if (upd->ip) { + if (upd->ip_locked) + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + upd->ip_locked = false; + } + + if (upd->dp_locked) + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + upd->dp_locked = false; +} + +/* + * Begin the process of creating a metadata file by allocating transactions + * and taking whatever resources we're going to need. + */ +int +xfs_metadir_start_create( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip == NULL); + ASSERT(xfs_has_metadir(mp)); + ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + /* + * If we ever need the ability to create rt metadata files on a + * pre-metadir filesystem, we'll need to dqattach the parent here. + * Currently we assume that mkfs will create the files and quotacheck + * will account for them. + */ + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp); + if (error) + goto out_teardown; + + /* + * Lock the parent directory if there is one. We can't ijoin it to + * the transaction until after the child file has been created. + */ + xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + upd->dp_locked = true; + + trace_xfs_metadir_start_create(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Create a metadata inode with the given @mode, and insert it into the + * metadata directory tree at the given @upd->path. The path up to the final + * component must already exist. The final path component must not exist. + * + * The new metadata inode will be attached to the update structure @upd->ip, + * with the ILOCK held until the caller releases it. + * + * NOTE: This function may return a new inode to the caller even if it returns + * a negative error code. If an inode is passed back, the caller must finish + * setting up the inode before releasing it. + */ +int +xfs_metadir_create( + struct xfs_metadir_update *upd, + umode_t mode) +{ + struct xfs_icreate_args args = { + .pip = upd->dp, + .mode = mode, + }; + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + + /* Check that the name does not already exist in the directory. */ + xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dialloc(&upd->tp, &args, &ino); + if (error) + return error; + error = xfs_icreate(upd->tp, ino, &args, &upd->ip); + if (error) + return error; + du.ip = upd->ip; + xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type); + upd->ip_locked = true; + + /* + * Join the directory inode to the transaction. We do not do it + * earlier because xfs_dialloc rolls the transaction. + */ + xfs_trans_ijoin(upd->tp, upd->dp, 0); + + /* Create the entry. */ + if (S_ISDIR(args.mode)) + resblks = xfs_mkdir_space_res(mp, xname.len); + else + resblks = xfs_create_space_res(mp, xname.len); + xname.type = xfs_mode_to_ftype(args.mode); + + trace_xfs_metadir_try_create(upd); + + error = xfs_dir_create_child(upd->tp, resblks, &du); + if (error) + return error; + + /* Metadir files are not accounted to quota. */ + + trace_xfs_metadir_create(upd); + + return 0; +} + +#ifndef __KERNEL__ +/* + * Begin the process of linking a metadata file by allocating transactions + * and locking whatever resources we're going to need. + */ +int +xfs_metadir_start_link( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + unsigned int resblks; + int nospace_error = 0; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip != NULL); + ASSERT(xfs_has_metadir(mp)); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + resblks = xfs_link_space_res(mp, MAXNAMELEN); + error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip, + &resblks, &upd->tp, &nospace_error); + if (error) + goto out_teardown; + if (!resblks) { + /* We don't allow reservationless updates. */ + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + error = nospace_error; + goto out_teardown; + } + + upd->dp_locked = true; + upd->ip_locked = true; + + trace_xfs_metadir_start_link(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Link the metadata directory given by @path to the inode @upd->ip. + * The path (up to the final component) must already exist, but the final + * component must not already exist. + */ +int +xfs_metadir_link( + struct xfs_metadir_update *upd) +{ + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ip = upd->ip, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL); + + /* Look up the name in the current directory. */ + xfs_metadir_set_xname(&xname, upd->path, + xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode)); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + resblks = xfs_link_space_res(mp, xname.len); + error = xfs_dir_add_child(upd->tp, resblks, &du); + if (error) + return error; + + trace_xfs_metadir_link(upd); + + return 0; +} +#endif /* ! __KERNEL__ */ + +/* Commit a metadir update and unlock/drop all resources. */ +int +xfs_metadir_commit( + struct xfs_metadir_update *upd) +{ + int error; + + trace_xfs_metadir_commit(upd); + + error = xfs_trans_commit(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); + return error; +} + +/* Cancel a metadir update and unlock/drop all resources. */ +void +xfs_metadir_cancel( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_cancel(upd); + + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); +} + +/* Create a metadata for the last component of the path. */ +int +xfs_metadir_mkdir( + struct xfs_inode *dp, + const char *path, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .path = path, + .metafile_type = XFS_METAFILE_DIR, + }; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + /* Allocate a transaction to create the last directory. */ + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + /* Create the subdirectory and take our reference. */ + error = xfs_metadir_create(&upd, S_IFDIR); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_irele; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); +out_irele: + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } + return error; +} diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h new file mode 100644 index 000000000000..bfecac7d3d14 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METADIR_H__ +#define __XFS_METADIR_H__ + +/* Cleanup widget for metadata inode creation and deletion. */ +struct xfs_metadir_update { + /* Parent directory */ + struct xfs_inode *dp; + + /* Path to metadata file */ + const char *path; + + /* Parent pointer update context */ + struct xfs_parent_args *ppargs; + + /* Child metadata file */ + struct xfs_inode *ip; + + struct xfs_trans *tp; + + enum xfs_metafile_type metafile_type; + + unsigned int dp_locked:1; + unsigned int ip_locked:1; +}; + +int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp, + const char *path, enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp); + +int xfs_metadir_start_create(struct xfs_metadir_update *upd); +int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode); + +int xfs_metadir_start_link(struct xfs_metadir_update *upd); +int xfs_metadir_link(struct xfs_metadir_update *upd); + +int xfs_metadir_commit(struct xfs_metadir_update *upd); +void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error); + +int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path, + struct xfs_inode **ipp); + +#endif /* __XFS_METADIR_H__ */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c new file mode 100644 index 000000000000..225923e463c4 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" + +static const struct { + enum xfs_metafile_type mtype; + const char *name; +} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR }; + +const char * +xfs_metafile_type_str(enum xfs_metafile_type metatype) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) { + if (xfs_metafile_type_strs[i].mtype == metatype) + return xfs_metafile_type_strs[i].name; + } + + return NULL; +} + +/* Set up an inode to be recognized as a metadata directory inode. */ +void +xfs_metafile_set_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip, + enum xfs_metafile_type metafile_type) +{ + VFS_I(ip)->i_mode &= ~0777; + VFS_I(ip)->i_uid = GLOBAL_ROOT_UID; + VFS_I(ip)->i_gid = GLOBAL_ROOT_GID; + if (S_ISDIR(VFS_I(ip)->i_mode)) + ip->i_diflags |= XFS_METADIR_DIFLAGS; + else + ip->i_diflags |= XFS_METAFILE_DIFLAGS; + ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; + ip->i_metatype = metafile_type; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Clear the metadata directory inode flag. */ +void +xfs_metafile_clear_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(VFS_I(ip)->i_nlink == 0); + + ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * Is the metafile reservations at or beneath a certain threshold? + */ +static inline bool +xfs_metafile_resv_can_cover( + struct xfs_mount *mp, + int64_t rhs) +{ + /* + * The amount of space that can be allocated to this metadata file is + * the remaining reservation for the particular metadata file + the + * global free block count. Take care of the first case to avoid + * touching the per-cpu counter. + */ + if (mp->m_metafile_resv_avail >= rhs) + return true; + + /* + * There aren't enough blocks left in the inode's reservation, but it + * isn't critical unless there also isn't enough free space. + */ + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; +} + +/* + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). + */ +bool +xfs_metafile_resv_critical( + struct xfs_mount *mp) +{ + ASSERT(xfs_has_metadir(mp)); + + trace_xfs_metafile_resv_critical(mp, 0); + + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) + return true; + + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) + return true; + + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); +} + +/* Allocate a block from the metadata file's reservation. */ +void +xfs_metafile_resv_alloc_space( + struct xfs_inode *ip, + struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = ip->i_mount; + int64_t len = args->len; + + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(args->resv == XFS_AG_RESV_METAFILE); + + trace_xfs_metafile_resv_alloc_space(mp, args->len); + + /* + * Allocate the blocks from the metadata inode's block reservation + * and update the ondisk sb counter. + */ + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { + int64_t from_resv; + + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; + xfs_mod_delalloc(ip, 0, -from_resv); + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, + -from_resv); + len -= from_resv; + } + + /* + * Any allocation in excess of the reservation requires in-core and + * on-disk fdblocks updates. If we can grab @len blocks from the + * in-core fdblocks then all we need to do is update the on-disk + * superblock; if not, then try to steal some from the transaction's + * block reservation. Overruns are only expected for rmap btrees. + */ + if (len) { + unsigned int field; + int error; + + error = xfs_dec_fdblocks(ip->i_mount, len, true); + if (error) + field = XFS_TRANS_SB_FDBLOCKS; + else + field = XFS_TRANS_SB_RES_FDBLOCKS; + + xfs_trans_mod_sb(args->tp, field, -len); + } + + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + + ip->i_nblocks += args->len; + xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); +} + +/* Free a block to the metadata file's reservation. */ +void +xfs_metafile_resv_free_space( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_filblks_t len) +{ + struct xfs_mount *mp = ip->i_mount; + int64_t to_resv; + + ASSERT(xfs_is_metadir_inode(ip)); + + trace_xfs_metafile_resv_free_space(mp, len); + + ip->i_nblocks -= len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + + /* + * Add the freed blocks back into the inode's delalloc reservation + * until it reaches the maximum size. Update the ondisk fdblocks only. + */ + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); + if (to_resv > 0) { + to_resv = min_t(int64_t, to_resv, len); + mp->m_metafile_resv_avail += to_resv; + xfs_mod_delalloc(ip, 0, to_resv); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); + len -= to_resv; + } + mutex_unlock(&mp->m_metafile_resv_lock); + + /* + * Everything else goes back to the filesystem, so update the in-core + * and on-disk counters. + */ + if (len) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); +} + +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ +void +xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (!xfs_has_metadir(mp)) + return; + + trace_xfs_metafile_resv_free(mp, 0); + + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); +} + +/* Set up a metafile space reservation. */ +int +xfs_metafile_resv_init( + struct xfs_mount *mp) +{ + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; + xfs_filblks_t hidden_space; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; + + if (!xfs_has_metadir(mp)) + return 0; + + /* + * Free any previous reservation to have a clean slate. + */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. + */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; + + /* + * Space taken by the per-AG metadata btrees are accounted on-disk as + * used space. We therefore only hide the space that is reserved but + * not used by the trees. + */ + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; + + error = xfs_dec_fdblocks(mp, hidden_space, true); + if (error) { + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; + } + + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); + +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; +} diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h new file mode 100644 index 000000000000..ae6f9e779b98 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METAFILE_H__ +#define __XFS_METAFILE_H__ + +const char *xfs_metafile_type_str(enum xfs_metafile_type metatype); + +/* All metadata files must have these flags set. */ +#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \ + XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | \ + XFS_DIFLAG_NODUMP | \ + XFS_DIFLAG_NODEFRAG) + +/* All metadata directories must have these flags set. */ +#define XFS_METADIR_DIFLAGS (XFS_METAFILE_DIFLAGS | \ + XFS_DIFLAG_NOSYMLINKS) + +void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, + enum xfs_metafile_type metafile_type); +void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); + +/* Space reservations for metadata inodes. */ +struct xfs_alloc_arg; + +bool xfs_metafile_resv_critical(struct xfs_mount *mp); +void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, + struct xfs_alloc_arg *args); +void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, + xfs_filblks_t len); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); + +/* Code specific to kernel/userspace; must be provided externally. */ + +int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); +int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); + +#endif /* __XFS_METAFILE_H__ */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 81885a6a028e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -19,40 +19,46 @@ static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) +#define XFS_CHECK_SB_OFFSET(field, offset) \ + XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \ + XFS_CHECK_OFFSET(struct xfs_sb, field, offset); + static inline void __init xfs_check_ondisk_structs(void) { - /* ag/file structures */ + /* file structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); - XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); - XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); + + /* space btrees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); + XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); @@ -67,32 +73,38 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4); /* realtime structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56); XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); + XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); /* - * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to - * 4 bytes anyway so it's not obviously a problem. Hence for the moment - * we don't check this structure. This can be re-instated when the attr - * definitions are updated to use c99 VLA definitions. + * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad + * it to 4 bytes anyway so it's not obviously a problem. Hence for the + * moment we don't check this structure. This can be re-instated when + * the attr definitions are updated to use c99 VLA definitions. * - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12); */ - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); @@ -100,25 +112,40 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); - XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); - XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); + + /* ondisk dir/attr structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); @@ -155,6 +182,16 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* ondisk log structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88); + + /* parent pointer ioctls */ + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64); + /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5 @@ -194,6 +231,72 @@ xfs_check_ondisk_structs(void) XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, 16299260424LL); + + /* superblock field checks we got from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); + XFS_CHECK_SB_OFFSET(sb_magicnum, 0); + XFS_CHECK_SB_OFFSET(sb_blocksize, 4); + XFS_CHECK_SB_OFFSET(sb_dblocks, 8); + XFS_CHECK_SB_OFFSET(sb_rblocks, 16); + XFS_CHECK_SB_OFFSET(sb_rextents, 24); + XFS_CHECK_SB_OFFSET(sb_uuid, 32); + XFS_CHECK_SB_OFFSET(sb_logstart, 48); + XFS_CHECK_SB_OFFSET(sb_rootino, 56); + XFS_CHECK_SB_OFFSET(sb_rbmino, 64); + XFS_CHECK_SB_OFFSET(sb_rsumino, 72); + XFS_CHECK_SB_OFFSET(sb_rextsize, 80); + XFS_CHECK_SB_OFFSET(sb_agblocks, 84); + XFS_CHECK_SB_OFFSET(sb_agcount, 88); + XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92); + XFS_CHECK_SB_OFFSET(sb_logblocks, 96); + XFS_CHECK_SB_OFFSET(sb_versionnum, 100); + XFS_CHECK_SB_OFFSET(sb_sectsize, 102); + XFS_CHECK_SB_OFFSET(sb_inodesize, 104); + XFS_CHECK_SB_OFFSET(sb_inopblock, 106); + XFS_CHECK_SB_OFFSET(sb_blocklog, 120); + XFS_CHECK_SB_OFFSET(sb_fname[12], 120); + XFS_CHECK_SB_OFFSET(sb_sectlog, 121); + XFS_CHECK_SB_OFFSET(sb_inodelog, 122); + XFS_CHECK_SB_OFFSET(sb_inopblog, 123); + XFS_CHECK_SB_OFFSET(sb_agblklog, 124); + XFS_CHECK_SB_OFFSET(sb_rextslog, 125); + XFS_CHECK_SB_OFFSET(sb_inprogress, 126); + XFS_CHECK_SB_OFFSET(sb_imax_pct, 127); + XFS_CHECK_SB_OFFSET(sb_icount, 128); + XFS_CHECK_SB_OFFSET(sb_ifree, 136); + XFS_CHECK_SB_OFFSET(sb_fdblocks, 144); + XFS_CHECK_SB_OFFSET(sb_frextents, 152); + XFS_CHECK_SB_OFFSET(sb_uquotino, 160); + XFS_CHECK_SB_OFFSET(sb_gquotino, 168); + XFS_CHECK_SB_OFFSET(sb_qflags, 176); + XFS_CHECK_SB_OFFSET(sb_flags, 178); + XFS_CHECK_SB_OFFSET(sb_shared_vn, 179); + XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180); + XFS_CHECK_SB_OFFSET(sb_unit, 184); + XFS_CHECK_SB_OFFSET(sb_width, 188); + XFS_CHECK_SB_OFFSET(sb_dirblklog, 192); + XFS_CHECK_SB_OFFSET(sb_logsectlog, 193); + XFS_CHECK_SB_OFFSET(sb_logsectsize, 194); + XFS_CHECK_SB_OFFSET(sb_logsunit, 196); + XFS_CHECK_SB_OFFSET(sb_features2, 200); + XFS_CHECK_SB_OFFSET(sb_bad_features2, 204); + XFS_CHECK_SB_OFFSET(sb_features_compat, 208); + XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212); + XFS_CHECK_SB_OFFSET(sb_features_incompat, 216); + XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220); + XFS_CHECK_SB_OFFSET(sb_crc, 224); + XFS_CHECK_SB_OFFSET(sb_spino_align, 228); + XFS_CHECK_SB_OFFSET(sb_pquotino, 232); + XFS_CHECK_SB_OFFSET(sb_lsn, 240); + XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248); + XFS_CHECK_SB_OFFSET(sb_metadirino, 264); + XFS_CHECK_SB_OFFSET(sb_rgcount, 272); + XFS_CHECK_SB_OFFSET(sb_rgextents, 276); + XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); + XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c new file mode 100644 index 000000000000..69366c44a701 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_sf.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log.h" +#include "xfs_xattr.h" +#include "xfs_parent.h" +#include "xfs_trans_space.h" +#include "xfs_attr_item.h" +#include "xfs_health.h" + +struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer attribute handling. + * + * Because the attribute name is a filename component, it will never be longer + * than 255 bytes and must not contain nulls or slashes. These are roughly the + * same constraints that apply to attribute names. + * + * The attribute value must always be a struct xfs_parent_rec. This means the + * attribute will never be in remote format because 12 bytes is nowhere near + * xfs_attr_leaf_entsize_local_max() (~75% of block size). + * + * Creating a new parent attribute will always create a new attribute - there + * should never, ever be an existing attribute in the tree for a new inode. + * ENOSPC behavior is problematic - creating the inode without the parent + * pointer is effectively a corruption, so we allow parent attribute creation + * to dip into the reserve block pool to avoid unexpected ENOSPC errors from + * occurring. + */ + +/* Return true if parent pointer attr name is valid. */ +bool +xfs_parent_namecheck( + unsigned int attr_flags, + const void *name, + size_t length) +{ + /* + * Parent pointers always use logged operations, so there should never + * be incomplete xattrs. + */ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + + return xfs_dir2_namecheck(name, length); +} + +/* Return true if parent pointer attr value is valid. */ +bool +xfs_parent_valuecheck( + struct xfs_mount *mp, + const void *value, + size_t valuelen) +{ + const struct xfs_parent_rec *rec = value; + + if (!xfs_has_parent(mp)) + return false; + + /* The xattr value must be a parent record. */ + if (valuelen != sizeof(struct xfs_parent_rec)) + return false; + + /* The parent record must be local. */ + if (value == NULL) + return false; + + /* The parent inumber must be valid. */ + if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino))) + return false; + + return true; +} + +/* Compute the attribute name hash for a parent pointer. */ +xfs_dahash_t +xfs_parent_hashval( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + xfs_ino_t parent_ino) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + + /* + * Use the same dirent name hash as would be used on the directory, but + * mix in the parent inode number to avoid collisions on hardlinked + * files with identical names but different parents. + */ + return xfs_dir2_hashname(mp, &xname) ^ + upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino); +} + +/* Compute the attribute name hash from the xattr components. */ +xfs_dahash_t +xfs_parent_hashattr( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + const struct xfs_parent_rec *rec = value; + + /* Requires a local attr value in xfs_parent_rec format */ + if (valuelen != sizeof(struct xfs_parent_rec)) { + ASSERT(valuelen == sizeof(struct xfs_parent_rec)); + return 0; + } + + if (!value) { + ASSERT(value != NULL); + return 0; + } + + return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino)); +} + +/* + * Initialize the parent pointer arguments structure. Caller must have zeroed + * the contents of @args. @tp is only required for updates. + */ +static void +xfs_parent_da_args_init( + struct xfs_da_args *args, + struct xfs_trans *tp, + struct xfs_parent_rec *rec, + struct xfs_inode *child, + xfs_ino_t owner, + const struct xfs_name *parent_name) +{ + args->geo = child->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->attr_filter = XFS_ATTR_PARENT; + args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT; + args->trans = tp; + args->dp = child; + args->owner = owner; + args->name = parent_name->name; + args->namelen = parent_name->len; + args->value = rec; + args->valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_sethash(args); +} + +/* Make sure the incore state is ready for a parent pointer query/update. */ +static inline int +xfs_parent_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *child) +{ + /* Parent pointers require that the attr fork must exist. */ + if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) { + xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT); + return -EFSCORRUPTED; + } + + return xfs_iread_extents(tp, child, XFS_ATTR_FORK); +} + +/* Add a parent pointer to reflect a dirent addition. */ +int +xfs_parent_addname( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET); + return 0; +} + +/* Remove a parent pointer to reflect a dirent removal. */ +int +xfs_parent_removename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE); + return 0; +} + +/* Replace one parent pointer with another to reflect a rename. */ +int +xfs_parent_replacename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, + const struct xfs_name *old_name, + struct xfs_inode *new_dp, + const struct xfs_name *new_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, old_dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, old_name); + + xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp); + ppargs->args.new_name = new_name->name; + ppargs->args.new_namelen = new_name->len; + ppargs->args.new_value = &ppargs->new_rec; + ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE); + return 0; +} + +/* + * Extract parent pointer information from any parent pointer xattr into + * @parent_ino/gen. The last two parameters can be NULL pointers. + * + * Returns 0 if this is not a parent pointer xattr at all; or -EFSCORRUPTED for + * garbage. + */ +int +xfs_parent_from_attr( + struct xfs_mount *mp, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + xfs_ino_t *parent_ino, + uint32_t *parent_gen) +{ + const struct xfs_parent_rec *rec = value; + + ASSERT(attr_flags & XFS_ATTR_PARENT); + + if (!xfs_parent_namecheck(attr_flags, name, namelen)) + return -EFSCORRUPTED; + if (!xfs_parent_valuecheck(mp, value, valuelen)) + return -EFSCORRUPTED; + + if (parent_ino) + *parent_ino = be64_to_cpu(rec->p_ino); + if (parent_gen) + *parent_gen = be32_to_cpu(rec->p_gen); + return 0; +} + +/* + * Look up a parent pointer record (@parent_name -> @pptr) of @ip. + * + * Caller must hold at least ILOCK_SHARED. The scratchpad need not be + * initialized. + * + * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a + * negative errno. + */ +int +xfs_parent_lookup( + struct xfs_trans *tp, + struct xfs_inode *ip, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name); + return xfs_attr_get_ilocked(scratch); +} + +/* Sanity-check a parent pointer before we try to perform repairs. */ +static inline bool +xfs_parent_sanity_check( + struct xfs_mount *mp, + const struct xfs_name *parent_name, + const struct xfs_parent_rec *pptr) +{ + if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name, + parent_name->len)) + return false; + + if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr))) + return false; + + return true; +} + + +/* + * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_set( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false); +} + +/* + * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_unset( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false); +} diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h new file mode 100644 index 000000000000..b8036527cdc7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All Rights Reserved. + */ +#ifndef __XFS_PARENT_H__ +#define __XFS_PARENT_H__ + +/* Metadata validators */ +bool xfs_parent_namecheck(unsigned int attr_flags, const void *name, + size_t length); +bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value, + size_t valuelen); + +xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name, + int namelen, xfs_ino_t parent_ino); +xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name, + int namelen, const void *value, int valuelen); + +/* Initializes a xfs_parent_rec to be stored as an attribute name. */ +static inline void +xfs_parent_rec_init( + struct xfs_parent_rec *rec, + xfs_ino_t ino, + uint32_t gen) +{ + rec->p_ino = cpu_to_be64(ino); + rec->p_gen = cpu_to_be32(gen); +} + +/* Initializes a xfs_parent_rec to be stored as an attribute name. */ +static inline void +xfs_inode_to_parent_rec( + struct xfs_parent_rec *rec, + const struct xfs_inode *dp) +{ + xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation); +} + +extern struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer information needed to pass around the deferred xattr update + * machinery. + */ +struct xfs_parent_args { + struct xfs_parent_rec rec; + struct xfs_parent_rec new_rec; + struct xfs_da_args args; +}; + +/* + * Start a parent pointer update by allocating the context object we need to + * perform a parent pointer update. + */ +static inline int +xfs_parent_start( + struct xfs_mount *mp, + struct xfs_parent_args **ppargsp) +{ + if (!xfs_has_parent(mp)) { + *ppargsp = NULL; + return 0; + } + + *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL); + if (!*ppargsp) + return -ENOMEM; + return 0; +} + +/* Finish a parent pointer update by freeing the context object. */ +static inline void +xfs_parent_finish( + struct xfs_mount *mp, + struct xfs_parent_args *ppargs) +{ + if (ppargs) + kmem_cache_free(xfs_parent_args_cache, ppargs); +} + +int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_replacename(struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, const struct xfs_name *old_name, + struct xfs_inode *new_dp, const struct xfs_name *new_name, + struct xfs_inode *child); + +int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags, + const unsigned char *name, unsigned int namelen, + const void *value, unsigned int valuelen, + xfs_ino_t *parent_ino, uint32_t *parent_gen); + +/* Repair functions */ +int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); + +#endif /* __XFS_PARENT_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index cb035da3f990..763d941a8420 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -56,7 +56,7 @@ typedef uint8_t xfs_dqtype_t; * And, of course, we also need to take into account the dquot log format item * used to describe each dquot. */ -#define XFS_DQUOT_LOGRES(mp) \ +#define XFS_DQUOT_LOGRES \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) @@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, __be32 dtimer); __be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); +static inline const char * +xfs_dqinode_path(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return "user"; + case XFS_DQTYPE_GROUP: + return "group"; + case XFS_DQTYPE_PROJ: + return "project"; + } + + ASSERT(0); + return NULL; +} + +static inline enum xfs_metafile_type +xfs_dqinode_metafile_type(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_METAFILE_USRQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_METAFILE_GRPQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_METAFILE_PRJQUOTA; + } + + ASSERT(0); + return XFS_METAFILE_UNKNOWN; +} + +unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type); + +int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dqtype_t type, struct xfs_inode **ipp); +int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode **ipp); +int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode *ip); +int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp); +int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6709a7f8bad5..cebe83f7842a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -23,6 +23,11 @@ #include "xfs_refcount.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" +#include "xfs_refcount_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -50,7 +55,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -70,7 +75,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; @@ -90,7 +95,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -126,7 +131,7 @@ xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) return __this_address; if (!xfs_refcount_check_domain(irec)) @@ -136,12 +141,43 @@ xfs_refcount_check_irec( if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) return __this_address; - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) return __this_address; return NULL; } +xfs_failaddr_t +xfs_rtrefcount_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) + return __this_address; + + if (!xfs_refcount_check_domain(irec)) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount)) + return __this_address; + + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) + return __this_address; + + return NULL; +} + +static inline xfs_failaddr_t +xfs_refcount_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec) +{ + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, @@ -150,12 +186,19 @@ xfs_refcount_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { + xfs_warn(mp, + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + } else { + xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); + } xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -177,11 +220,11 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); + fa = xfs_refcount_check_btrec(cur, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur, irec); return 0; } @@ -199,7 +242,7 @@ xfs_refcount_update( uint32_t start; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_update(cur, irec); start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); @@ -209,8 +252,7 @@ xfs_refcount_update( error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_update_error(cur, error, _RET_IP_); return error; } @@ -227,7 +269,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_insert(cur, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; @@ -238,14 +280,14 @@ xfs_refcount_insert( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } out_error: if (error) - trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_insert_error(cur, error, _RET_IP_); return error; } @@ -268,12 +310,14 @@ xfs_refcount_delete( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); + trace_xfs_refcount_delete(cur, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -283,8 +327,7 @@ xfs_refcount_delete( &found_rec); out_error: if (error) - trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_delete_error(cur, error, _RET_IP_); return error; } @@ -398,6 +441,7 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -407,8 +451,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - &rcext, agbno); + trace_xfs_refcount_split_extent(cur, &rcext, agbno); /* Establish the right extent. */ tmp = rcext; @@ -425,14 +468,14 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } return error; out_error: - trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_); return error; } @@ -451,8 +494,7 @@ xfs_refcount_merge_center_extents( int error; int found_rec; - trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, center, right); + trace_xfs_refcount_merge_center_extents(cur, left, center, right); ASSERT(left->rc_domain == center->rc_domain); ASSERT(right->rc_domain == center->rc_domain); @@ -470,6 +512,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -478,6 +521,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -487,6 +531,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -498,6 +543,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -511,8 +557,7 @@ xfs_refcount_merge_center_extents( return error; out_error: - trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_); return error; } @@ -530,8 +575,7 @@ xfs_refcount_merge_left_extent( int error; int found_rec; - trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, cleft); + trace_xfs_refcount_merge_left_extent(cur, left, cleft); ASSERT(left->rc_domain == cleft->rc_domain); @@ -542,6 +586,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -550,6 +595,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -561,6 +607,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -575,8 +622,7 @@ xfs_refcount_merge_left_extent( return error; out_error: - trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_); return error; } @@ -593,8 +639,7 @@ xfs_refcount_merge_right_extent( int error; int found_rec; - trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, cright, right); + trace_xfs_refcount_merge_right_extent(cur, cright, right); ASSERT(right->rc_domain == cright->rc_domain); @@ -608,6 +653,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -616,6 +662,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -627,6 +674,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -641,8 +689,7 @@ xfs_refcount_merge_right_extent( return error; out_error: - trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_); return error; } @@ -674,6 +721,7 @@ xfs_refcount_find_left_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -693,6 +741,7 @@ xfs_refcount_find_left_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -729,13 +778,11 @@ not_found: cleft->rc_refcount = 1; cleft->rc_domain = domain; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, cleft, agbno); + trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno); return error; out_error: - trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_); return error; } @@ -767,6 +814,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -786,6 +834,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -822,13 +871,12 @@ not_found: cright->rc_refcount = 1; cright->rc_domain = domain; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - cright, right, agbno + aglen); + trace_xfs_refcount_find_right_extent(cur, cright, right, + agbno + aglen); return error; out_error: - trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_); return error; } @@ -845,9 +893,9 @@ xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ - if (irec->rc_refcount == MAXREFCOUNT) - return MAXREFCOUNT; + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) + return XFS_REFC_REFCOUNT_MAX; return irec->rc_refcount + adjust; } @@ -890,7 +938,7 @@ xfs_refc_want_merge_center( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount + right->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; *ulenp = ulen; @@ -925,7 +973,7 @@ xfs_refc_want_merge_left( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -959,7 +1007,7 @@ xfs_refc_want_merge_right( * hence we need to catch u32 addition overflows here. */ ulen += cright->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -1056,25 +1104,41 @@ xfs_refcount_still_have_space( * to handle each of the shape changes to the refcount btree. */ overhead = xfs_allocfree_block_count(cur->bc_mp, - cur->bc_ag.refc.shape_changes); - overhead += cur->bc_mp->m_refc_maxlevels; + cur->bc_refc.shape_changes); + overhead += cur->bc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. */ - if (cur->bc_ag.refc.nr_ops > 2 && + if (cur->bc_refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_ag.refc.nr_ops == 0) + if (cur->bc_refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; - return cur->bc_tp->t_log_res - overhead > - cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + return cur->bc_tp->t_log_res - overhead > + cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; +} + +/* Schedule an extent free. */ +static int +xrefc_free_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *rec) +{ + unsigned int flags = 0; + + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + flags |= XFS_FREE_EXTENT_REALTIME; + + return xfs_free_extent_later(cur->bc_tp, + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); } /* @@ -1093,7 +1157,6 @@ xfs_refcount_adjust_extents( struct xfs_refcount_irec ext, tmp; int error; int found_rec, found_tmp; - xfs_fsblock_t fsbno; /* Merging did all the work already. */ if (*aglen == 0) @@ -1109,7 +1172,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_SHARED; @@ -1127,14 +1190,13 @@ xfs_refcount_adjust_extents( tmp.rc_refcount = 1 + adj; tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); /* * Either cover the hole (increment) or * delete the range (decrement). */ - cur->bc_ag.refc.nr_ops++; + cur->bc_refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1142,16 +1204,12 @@ xfs_refcount_adjust_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - tmp.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + error = xrefc_free_extent(cur, &tmp); if (error) goto out_error; } @@ -1180,6 +1238,7 @@ xfs_refcount_adjust_extents( */ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1188,12 +1247,11 @@ xfs_refcount_adjust_extents( * Adjust the reference count and either update the tree * (incr) or free the blocks (decr). */ - if (ext.rc_refcount == MAXREFCOUNT) + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) goto skip; ext.rc_refcount += adj; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); - cur->bc_ag.refc.nr_ops++; + trace_xfs_refcount_modify_extent(cur, &ext); + cur->bc_refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) @@ -1203,17 +1261,13 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } goto advloop; } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - ext.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + error = xrefc_free_extent(cur, &ext); if (error) goto out_error; } @@ -1230,8 +1284,7 @@ advloop: return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1248,11 +1301,9 @@ xfs_refcount_adjust( int error; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_increase(cur, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_decrease(cur, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. @@ -1281,7 +1332,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_ag.refc.shape_changes++; + cur->bc_refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); @@ -1291,28 +1342,10 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_error(cur, error, _RET_IP_); return error; } -/* Clean up after calling xfs_refcount_finish_one. */ -void -xfs_refcount_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) -{ - struct xfs_buf *agbp; - - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); -} - /* * Set up a continuation a deferred refcount operation by updating the intent. * Checks to make sure we're not going to run off the end of the AG. @@ -1324,16 +1357,18 @@ xfs_refcount_continue_op( xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, - ri->ri_blockcount))) + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } - ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno); ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1352,7 +1387,7 @@ xfs_refcount_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; + struct xfs_btree_cur *rcur = *pcur; struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; @@ -1361,9 +1396,7 @@ xfs_refcount_finish_one( bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), - ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), - ri->ri_blockcount); + trace_xfs_refcount_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; @@ -1372,25 +1405,25 @@ xfs_refcount_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - nr_ops = rcur->bc_ag.refc.nr_ops; - shape_changes = rcur->bc_ag.refc.shape_changes; - xfs_refcount_finish_one_cleanup(tp, rcur, 0); + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); rcur = NULL; *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(ri->ri_pag, tp, + struct xfs_perag *pag = to_perag(ri->ri_group); + + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); - rcur->bc_ag.refc.nr_ops = nr_ops; - rcur->bc_ag.refc.shape_changes = shape_changes; + *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; } - *pcur = rcur; switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: @@ -1426,8 +1459,116 @@ xfs_refcount_finish_one( return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, - ri->ri_type, bno, ri->ri_blockcount); + trace_xfs_refcount_finish_one_leftover(mp, ri); + return error; +} + +/* + * Set up a continuation a deferred rtrefcount operation by updating the + * intent. Checks to make sure we're not going to run off the end of the + * rtgroup. + */ +static inline int +xfs_rtrefcount_continue_op( + struct xfs_btree_cur *cur, + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); + + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); + return 0; +} + +/* + * Process one of the deferred realtime refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + */ +int +xfs_rtrefcount_finish_one( + struct xfs_trans *tp, + struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + struct xfs_btree_cur *rcur = *pcur; + int error = 0; + xfs_rgblock_t bno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); + + trace_xfs_refcount_deferred(mp, ri); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); + + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; + } + + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, ri); return error; } @@ -1438,25 +1579,21 @@ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, + bool isrt, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { struct xfs_refcount_intent *ri; - trace_xfs_refcount_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), - type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - blockcount); - ri = kmem_cache_alloc(xfs_refcount_intent_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + ri->ri_realtime = isrt; - xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + xfs_refcount_defer_add(tp, ri); } /* @@ -1465,12 +1602,13 @@ __xfs_refcount_add( void xfs_refcount_increase_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1480,12 +1618,13 @@ xfs_refcount_increase_extent( void xfs_refcount_decrease_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1511,8 +1650,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_find_shared(cur, agbno, aglen); /* By default, skip the whole range */ *fbno = NULLAGBLOCK; @@ -1535,6 +1673,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1552,6 +1691,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1585,6 +1725,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1596,13 +1737,11 @@ xfs_refcount_find_shared( } done: - trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *fbno, *flen); + trace_xfs_refcount_find_shared_result(cur, *fbno, *flen); out_error: if (error) - trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_); return error; } @@ -1682,11 +1821,12 @@ xfs_refcount_adjust_cow_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_COW; @@ -1697,6 +1837,7 @@ xfs_refcount_adjust_cow_extents( /* Adding a CoW reservation, there should be nothing here. */ if (XFS_IS_CORRUPT(cur->bc_mp, agbno + aglen > ext.rc_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1706,14 +1847,14 @@ xfs_refcount_adjust_cow_extents( tmp.rc_refcount = 1; tmp.rc_domain = XFS_REFC_DOMAIN_COW; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1721,25 +1862,28 @@ xfs_refcount_adjust_cow_extents( case XFS_REFCOUNT_ADJUST_COW_FREE: /* Removing a CoW reservation, there should be one extent. */ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } ext.rc_refcount = 0; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); + trace_xfs_refcount_modify_extent(cur, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1750,8 +1894,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1797,8 +1940,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_); return error; } @@ -1811,8 +1953,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_increase(rcur, agbno, aglen); /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, @@ -1828,8 +1969,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_decrease(rcur, agbno, aglen); /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, @@ -1840,6 +1980,7 @@ __xfs_refcount_cow_free( void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1848,17 +1989,17 @@ xfs_refcount_alloc_cow_extent( if (!xfs_has_reflink(mp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1868,9 +2009,8 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); } struct xfs_refcount_recovery { @@ -1889,17 +2029,20 @@ xfs_refcount_recover_extent( struct xfs_refcount_recovery *rr; if (XFS_IS_CORRUPT(cur->bc_mp, - be32_to_cpu(rec->refc.rc_refcount) != 1)) + be32_to_cpu(rec->refc.rc_refcount) != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } rr = kmalloc(sizeof(struct xfs_refcount_recovery), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); kfree(rr); return -EFSCORRUPTED; } @@ -1911,12 +2054,13 @@ xfs_refcount_recover_extent( /* Find and remove leftover CoW reservations. */ int xfs_refcount_recover_cow_leftovers( - struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_group *xg) { + struct xfs_mount *mp = xg->xg_mount; + bool isrt = xg->xg_type == XG_TYPE_RTG; struct xfs_trans *tp; struct xfs_btree_cur *cur; - struct xfs_buf *agbp; + struct xfs_buf *agbp = NULL; struct xfs_refcount_recovery *rr, *n; struct list_head debris; union xfs_btree_irec low = { @@ -1929,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers( xfs_fsblock_t fsb; int error; - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) - return -EOPNOTSUPP; + + if (isrt) { + if (!xfs_has_rtgroups(mp)) + return 0; + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) + return -EOPNOTSUPP; + } else { + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) + return -EOPNOTSUPP; + } INIT_LIST_HEAD(&debris); @@ -1950,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); - if (error) - goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + if (isrt) { + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); + } else { + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); + } /* Find all the leftover CoW staging extents. */ error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + if (agbp) + xfs_trans_brelse(tp, agbp); + else + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); xfs_trans_cancel(tp); if (error) goto out_free; @@ -1971,19 +2132,16 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, pag->pag_agno, - &rr->rr_rrec); - /* Free the orphan record */ - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, - rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, fsb, + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); + xfs_refcount_free_cow_extent(tp, isrt, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, + isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) goto out_trans; @@ -2048,7 +2206,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_refcount_check_btrec(cur, &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9b56768a590c..f2e299a716a4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -12,6 +12,7 @@ struct xfs_perag; struct xfs_btree_cur; struct xfs_bmbt_irec; struct xfs_refcount_irec; +struct xfs_rtgroup; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); @@ -48,12 +49,19 @@ enum xfs_refcount_intent_type { XFS_REFCOUNT_FREE_COW, }; +#define XFS_REFCOUNT_INTENT_STRINGS \ + { XFS_REFCOUNT_INCREASE, "incr" }, \ + { XFS_REFCOUNT_DECREASE, "decr" }, \ + { XFS_REFCOUNT_ALLOC_COW, "alloc_cow" }, \ + { XFS_REFCOUNT_FREE_COW, "free_cow" } + struct xfs_refcount_intent { struct list_head ri_list; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; + bool ri_realtime; }; /* Check that the refcount is appropriate for the record domain. */ @@ -68,29 +76,25 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_update_get_group(struct xfs_mount *mp, - struct xfs_refcount_intent *ri); - -void xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -void xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); -extern int xfs_refcount_finish_one(struct xfs_trans *tp, +int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); +int xfs_rtrefcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - struct xfs_perag *pag); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); /* * While we're adjusting the refcounts records of an extent, we have @@ -119,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 0d80bd99147c..54505fee1852 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -16,6 +16,7 @@ #include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" @@ -29,7 +30,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -67,23 +68,20 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, - xfs_refc_block(args.mp))); + xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp))); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.pag->pag_agno); + ASSERT(args.agno == cur->bc_group->xg_gno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -107,12 +105,10 @@ xfs_refcountbt_free_block( struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0); } STATIC int @@ -173,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -220,7 +216,7 @@ xfs_refcountbt_verify( if (!xfs_has_reflink(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; @@ -242,7 +238,7 @@ xfs_refcountbt_verify( } else if (level >= mp->m_refc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]); } STATIC void @@ -251,7 +247,7 @@ xfs_refcountbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_refcountbt_verify(bp); @@ -275,7 +271,7 @@ xfs_refcountbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -321,9 +317,17 @@ xfs_refcountbt_keys_contiguous( be32_to_cpu(key2->refc.rc_startblock)); } -static const struct xfs_btree_ops xfs_refcountbt_ops = { +const struct xfs_btree_ops xfs_refcountbt_ops = { + .name = "refcount", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), + .sick_mask = XFS_SICK_AG_REFCNTBT, .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, @@ -344,32 +348,10 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Initialize a new refcount btree cursor. + * Create a new refcount btree cursor. + * + * For staging cursors tp and agbp are NULL. */ -static struct xfs_btree_cur * -xfs_refcountbt_init_common( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; - - ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); - - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, - mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - - cur->bc_ag.pag = xfs_perag_hold(pag); - cur->bc_ag.refc.nr_ops = 0; - cur->bc_ag.refc.shape_changes = 0; - cur->bc_ops = &xfs_refcountbt_ops; - return cur; -} - -/* Create a btree cursor. */ struct xfs_btree_cur * xfs_refcountbt_init_cursor( struct xfs_mount *mp, @@ -377,26 +359,21 @@ xfs_refcountbt_init_cursor( struct xfs_buf *agbp, struct xfs_perag *pag) { - struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - cur->bc_ag.agbp = agbp; - return cur; -} + ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount); -/* Create a btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_refcountbt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, + mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); + cur->bc_group = xfs_group_hold(pag_group(pag)); + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; - cur = xfs_refcountbt_init_common(mp, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + } return cur; } @@ -421,7 +398,7 @@ xfs_refcountbt_commit_staged_btree( xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a refcount btree block. */ @@ -439,9 +416,10 @@ xfs_refcountbt_block_maxrecs( /* * Calculate the number of records in a refcount btree block. */ -int +unsigned int xfs_refcountbt_maxrecs( - int blocklen, + struct xfs_mount *mp, + unsigned int blocklen, bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; @@ -536,7 +514,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index d66b37259bed..beb93bef6a81 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -48,9 +48,8 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); -extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); +unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 76bf7f48cb5a..3cdf50563fec 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -23,6 +23,10 @@ #include "xfs_error.h" #include "xfs_inode.h" #include "xfs_ag.h" +#include "xfs_health.h" +#include "xfs_rmap_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -56,8 +60,10 @@ xfs_rmap_lookup_le( error = xfs_rmap_get_rec(cur, irec, &get_stat); if (error) return error; - if (!get_stat) + if (!get_stat) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -97,8 +103,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, - irec->rm_startblock, irec->rm_blockcount, + trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); @@ -108,8 +113,7 @@ xfs_rmap_update( xfs_rmap_irec_offset_pack(irec)); error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_update_error(cur, error, _RET_IP_); return error; } @@ -125,13 +129,13 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -145,13 +149,13 @@ xfs_rmap_insert( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } done: if (error) - trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_insert_error(rcur, error, _RET_IP_); return error; } @@ -167,13 +171,13 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -182,13 +186,13 @@ xfs_rmap_delete( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } done: if (error) - trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_delete_error(rcur, error, _RET_IP_); return error; } @@ -208,10 +212,10 @@ xfs_rmap_btrec_to_irec( /* Simple checks for rmap records. */ xfs_failaddr_t xfs_rmap_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_mount *mp = pag_mount(pag); bool is_inode; bool is_unwritten; bool is_bmbt; @@ -226,8 +230,8 @@ xfs_rmap_check_irec( return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, - irec->rm_blockcount)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) return __this_address; } @@ -262,6 +266,80 @@ xfs_rmap_check_irec( return NULL; } +static xfs_failaddr_t +xfs_rtrmap_check_meta_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (irec->rm_offset != 0) + return __this_address; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return __this_address; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + if (irec->rm_startblock != 0) + return __this_address; + if (irec->rm_blockcount != mp->m_sb.sb_rextsize) + return __this_address; + return NULL; + case XFS_RMAP_OWN_COW: + if (!xfs_has_rtreflink(mp)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; + return NULL; + default: + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_rtrmap_check_inode_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (!xfs_verify_ino(mp, irec->rm_owner)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount)) + return __this_address; + if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + return NULL; +} + +xfs_failaddr_t +xfs_rtrmap_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + if (irec->rm_blockcount == 0) + return __this_address; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK)) + return __this_address; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return xfs_rtrmap_check_meta_irec(rtg, irec); + return xfs_rtrmap_check_inode_irec(rtg, irec); +} + +static inline xfs_failaddr_t +xfs_rmap_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) +{ + if (xfs_btree_is_rtrmap(cur->bc_ops) || + xfs_btree_is_mem_rtrmap(cur->bc_ops)) + return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); + return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_rmap_complain_bad_rec( struct xfs_btree_cur *cur, @@ -270,13 +348,22 @@ xfs_rmap_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + if (xfs_btree_is_mem_rmap(cur->bc_ops)) + xfs_warn(mp, + "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else if (xfs_btree_is_rtrmap(cur->bc_ops)) + xfs_warn(mp, + "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + else + xfs_warn(mp, + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_group->xg_gno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, irec->rm_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -299,7 +386,7 @@ xfs_rmap_get_rec( fa = xfs_rmap_btrec_to_irec(rec, irec); if (!fa) - fa = xfs_rmap_check_irec(cur, irec); + fa = xfs_rmap_check_btrec(cur, irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, irec); @@ -320,8 +407,7 @@ xfs_rmap_find_left_neighbor_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -371,8 +457,8 @@ xfs_rmap_find_left_neighbor( info.high.rm_blockcount = 0; info.irec = irec; - trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); + trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset, + flags); /* * Historically, we always used the range query to walk every reverse @@ -403,8 +489,7 @@ xfs_rmap_find_left_neighbor( return error; *stat = 1; - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -419,8 +504,7 @@ xfs_rmap_lookup_le_range_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -467,8 +551,7 @@ xfs_rmap_lookup_le_range( *stat = 0; info.irec = irec; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, - bno, 0, owner, offset, flags); + trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags); /* * Historically, we always used the range query to walk every reverse @@ -499,8 +582,7 @@ xfs_rmap_lookup_le_range( return error; *stat = 1; - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -512,14 +594,15 @@ xfs_rmap_lookup_le_range( */ static int xfs_rmap_free_check_owner( - struct xfs_mount *mp, + struct xfs_btree_cur *cur, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_filblks_t len, + xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags) { + struct xfs_mount *mp = cur->bc_mp; int error = 0; if (owner == XFS_RMAP_OWN_UNKNOWN) @@ -529,12 +612,14 @@ xfs_rmap_free_check_owner( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (rec->rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } /* Make sure the owner matches what we expect to find in the tree. */ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -546,16 +631,19 @@ xfs_rmap_free_check_owner( if (flags & XFS_RMAP_BMBT_BLOCK) { if (XFS_IS_CORRUPT(mp, !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } } else { if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } if (XFS_IS_CORRUPT(mp, offset + len > ltoff + rec->rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -606,8 +694,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -618,14 +705,14 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, - ltrec.rm_blockcount, ltrec.rm_owner, - ltrec.rm_offset, ltrec.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, + ltrec.rm_flags); ltoff = ltrec.rm_offset; /* @@ -639,6 +726,7 @@ xfs_rmap_unmap( if (XFS_IS_CORRUPT(mp, bno < ltrec.rm_startblock + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -665,6 +753,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -677,26 +766,27 @@ xfs_rmap_unmap( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check owner information. */ - error = xfs_rmap_free_check_owner(mp, ltoff, <rec, len, owner, + error = xfs_rmap_free_check_owner(cur, ltoff, <rec, len, owner, offset, flags); if (error) goto out_error; if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - ltrec.rm_startblock, ltrec.rm_blockcount, - ltrec.rm_owner, ltrec.rm_offset, - ltrec.rm_flags); + trace_xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -767,8 +857,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - cur->bc_rec.r.rm_startblock, + trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, cur->bc_rec.r.rm_offset, @@ -779,15 +868,93 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of rmapbt live updates. If + * the compiler supports jump labels, the static branch will be replaced by a + * nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch); + +void +xfs_rmap_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_rmap_hooks_switch); +} + +void +xfs_rmap_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_rmap_hooks_switch); +} + +/* Call downstream hooks for a reverse mapping update. */ +static inline void +xfs_rmap_update_hook( + struct xfs_trans *tp, + struct xfs_group *xg, + enum xfs_rmap_intent_type op, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + bool unwritten, + const struct xfs_owner_info *oinfo) +{ + if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) { + struct xfs_rmap_update_params p = { + .startblock = startblock, + .blockcount = blockcount, + .unwritten = unwritten, + .oinfo = *oinfo, /* struct copy */ + }; + + if (xg) + xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p); + } +} + +/* Call the specified function during a reverse mapping update. */ +int +xfs_rmap_hook_add( + struct xfs_group *xg, + struct xfs_rmap_hook *hook) +{ + return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook); +} + +/* Stop calling the specified function during a reverse mapping update. */ +void +xfs_rmap_hook_del( + struct xfs_group *xg, + struct xfs_rmap_hook *hook) +{ + xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook); +} + +/* Configure rmap update hook functions. */ +void +xfs_rmap_hook_setup( + struct xfs_rmap_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->rmap_hook, mod_fn); +} +#else +# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Remove a reference to an extent in the rmap btree. */ @@ -808,7 +975,8 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len, + false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -874,8 +1042,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* @@ -888,8 +1055,7 @@ xfs_rmap_map( if (error) goto out_error; if (have_lt) { - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -900,6 +1066,7 @@ xfs_rmap_map( if (XFS_IS_CORRUPT(mp, have_lt != 0 && ltrec.rm_startblock + ltrec.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -917,17 +1084,19 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; } @@ -964,16 +1133,14 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - gtrec.rm_startblock, - gtrec.rm_blockcount, - gtrec.rm_owner, - gtrec.rm_offset, - gtrec.rm_flags); + trace_xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1015,23 +1182,21 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, flags); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } @@ -1055,6 +1220,8 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false, + oinfo); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1104,8 +1271,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for an exact match or the left-adjacent @@ -1116,14 +1282,14 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, PREV.rm_startblock, - PREV.rm_blockcount, PREV.rm_owner, - PREV.rm_offset, PREV.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); ASSERT(PREV.rm_offset <= offset); ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); @@ -1153,19 +1319,20 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, - LEFT.rm_blockcount, LEFT.rm_owner, - LEFT.rm_offset, LEFT.rm_flags); + trace_xfs_rmap_find_left_neighbor_result(cur, + LEFT.rm_startblock, LEFT.rm_blockcount, + LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && LEFT.rm_offset + LEFT.rm_blockcount == offset && xfs_rmap_is_mergeable(&LEFT, owner, newext)) @@ -1181,6 +1348,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1193,17 +1361,19 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && offset + len == RIGHT.rm_offset && xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1219,14 +1389,14 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* reset the cursor back to PREV */ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1246,17 +1416,18 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1264,17 +1435,18 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1282,6 +1454,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1297,14 +1470,14 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1312,6 +1485,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1331,17 +1505,18 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1349,6 +1524,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1413,12 +1589,12 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1461,6 +1637,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1470,12 +1647,12 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1501,14 +1678,14 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - NEW.rm_startblock, NEW.rm_blockcount, - NEW.rm_owner, NEW.rm_offset, + trace_xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); error = xfs_btree_insert(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1522,18 +1699,19 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1552,12 +1730,10 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -1593,8 +1769,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for and exact match or the left-adjacent @@ -1606,6 +1781,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1634,6 +1810,7 @@ xfs_rmap_convert_shared( if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1652,17 +1829,19 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) state |= RMAP_RIGHT_CONTIG; } @@ -1676,8 +1855,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. */ @@ -1706,6 +1884,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1732,6 +1911,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1758,6 +1938,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1781,6 +1962,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1816,6 +1998,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1861,6 +2044,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1896,6 +2080,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1934,6 +2119,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1967,12 +2153,10 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -2010,8 +2194,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -2023,6 +2206,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2033,12 +2217,14 @@ xfs_rmap_unmap_shared( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Make sure the owner matches what we expect to find in the tree. */ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2047,16 +2233,19 @@ xfs_rmap_unmap_shared( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check the offset. */ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2113,6 +2302,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2142,6 +2332,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2159,12 +2350,10 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } @@ -2199,8 +2388,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, @@ -2221,13 +2409,14 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; @@ -2273,6 +2462,7 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2318,12 +2508,10 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } @@ -2335,15 +2523,12 @@ xfs_rmap_map_raw( { struct xfs_owner_info oinfo; - oinfo.oi_owner = rmap->rm_owner; - oinfo.oi_offset = rmap->rm_offset; - oinfo.oi_flags = 0; - if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset, + rmap->rm_flags); - if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) || + XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) return xfs_rmap_map(cur, rmap->rm_startblock, rmap->rm_blockcount, rmap->rm_flags & XFS_RMAP_UNWRITTEN, @@ -2373,7 +2558,7 @@ xfs_rmap_query_range_helper( fa = xfs_rmap_btrec_to_irec(rec, &irec); if (!fa) - fa = xfs_rmap_check_irec(cur, &irec); + fa = xfs_rmap_check_btrec(cur, &irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, &irec); @@ -2411,21 +2596,77 @@ xfs_rmap_query_all( return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); } -/* Clean up after calling xfs_rmap_finish_one. */ -void -xfs_rmap_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) +/* Commit an rmap operation into the ondisk tree. */ +int +__xfs_rmap_finish_intent( + struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool unwritten) +{ + switch (op) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: + return xfs_rmap_map(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_MAP_SHARED: + return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: + return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_UNMAP_SHARED: + return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_CONVERT: + return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo); + case XFS_RMAP_CONVERT_SHARED: + return xfs_rmap_convert_shared(rcur, bno, len, !unwritten, + oinfo); + default: + ASSERT(0); + return -EFSCORRUPTED; + } +} + +static int +xfs_rmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) { - struct xfs_buf *agbp; + struct xfs_perag *pag = to_perag(ri->ri_group); + struct xfs_buf *agbp = NULL; + int error; - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); + /* + * Refresh the freelist before we start changing the rmapbt, because a + * shape change could cause us to allocate blocks. + */ + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return error; + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return -EFSCORRUPTED; + } + *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag); + return 0; +} + +static int +xfs_rtrmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + *pcur = xfs_rtrmapbt_init_cursor(tp, rtg); + return 0; } /* @@ -2441,20 +2682,13 @@ xfs_rmap_finish_one( struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; - struct xfs_buf *agbp = NULL; - int error = 0; struct xfs_owner_info oinfo; + struct xfs_mount *mp = tp->t_mountp; xfs_agblock_t bno; bool unwritten; + int error = 0; - bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - - trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, - ri->ri_owner, ri->ri_whichfork, - ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, - ri->ri_bmap.br_state); + trace_xfs_rmap_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; @@ -2463,66 +2697,33 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - xfs_rmap_finish_one_cleanup(tp, rcur, 0); - rcur = NULL; + if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) { + xfs_btree_del_cursor(*pcur, 0); *pcur = NULL; } - if (rcur == NULL) { - /* - * Refresh the freelist before we start changing the - * rmapbt, because a shape change could cause us to - * allocate blocks. - */ - error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); + if (*pcur == NULL) { + if (ri->ri_group->xg_type == XG_TYPE_RTG) + error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur); + else + error = xfs_rmap_finish_init_cursor(tp, ri, pcur); if (error) return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) - return -EFSCORRUPTED; - - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } - *pcur = rcur; xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff); unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (ri->ri_type) { - case XFS_RMAP_ALLOC: - case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_FREE: - case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, - !unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, - ri->ri_bmap.br_blockcount, !unwritten, &oinfo); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); + error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, &oinfo, unwritten); + if (error) + return error; - return error; + xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); + return 0; } /* @@ -2545,29 +2746,21 @@ __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, uint64_t owner, + bool isrt, int whichfork, struct xfs_bmbt_irec *bmap) { struct xfs_rmap_intent *ri; - trace_xfs_rmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - owner, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); - - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + ri->ri_realtime = isrt; - xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); + xfs_rmap_defer_add(tp, ri); } /* Map an extent into a file. */ @@ -2579,6 +2772,7 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2586,7 +2780,7 @@ xfs_rmap_map_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_MAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2598,6 +2792,7 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2605,7 +2800,7 @@ xfs_rmap_unmap_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_UNMAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* @@ -2623,6 +2818,7 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(mp, whichfork)) return; @@ -2630,15 +2826,15 @@ xfs_rmap_convert_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_CONVERT_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ void xfs_rmap_alloc_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2647,20 +2843,20 @@ xfs_rmap_alloc_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ void xfs_rmap_free_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2669,12 +2865,12 @@ xfs_rmap_free_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 3c98d9d50afb..5f39f6e53cd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,6 +7,7 @@ #define __XFS_RMAP_H__ struct xfs_perag; +struct xfs_rtgroup; static inline void xfs_rmap_ino_bmbt_owner( @@ -157,18 +158,26 @@ enum xfs_rmap_intent_type { XFS_RMAP_FREE, }; +#define XFS_RMAP_INTENT_STRINGS \ + { XFS_RMAP_MAP, "map" }, \ + { XFS_RMAP_MAP_SHARED, "map_shared" }, \ + { XFS_RMAP_UNMAP, "unmap" }, \ + { XFS_RMAP_UNMAP_SHARED, "unmap_shared" }, \ + { XFS_RMAP_CONVERT, "cvt" }, \ + { XFS_RMAP_CONVERT_SHARED, "cvt_shared" }, \ + { XFS_RMAP_ALLOC, "alloc" }, \ + { XFS_RMAP_FREE, "free" } + struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; + bool ri_realtime; }; -void xfs_rmap_update_get_group(struct xfs_mount *mp, - struct xfs_rmap_intent *ri); - /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); @@ -177,15 +186,17 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); +void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); +void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); -void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); +int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, xfs_agblock_t bno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, + bool unwritten); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, @@ -195,7 +206,9 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a, union xfs_btree_rec; xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, + const struct xfs_rmap_irec *irec); +xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg, const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -235,4 +248,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache; int __init xfs_rmap_intent_init_cache(void); void xfs_rmap_intent_destroy_cache(void); +/* + * Parameters for tracking reverse mapping changes. The hook function arg + * parameter is enum xfs_rmap_intent_type, and the rest is below. + */ +struct xfs_rmap_update_params { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + struct xfs_owner_info oinfo; + bool unwritten; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS + +struct xfs_rmap_hook { + struct xfs_hook rmap_hook; +}; + +void xfs_rmap_hook_disable(void); +void xfs_rmap_hook_enable(void); + +int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); +#endif + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 6c81b20e97d2..2cab694ac58a 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -16,11 +16,14 @@ #include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" static struct kmem_cache *xfs_rmapbt_cur_cache; @@ -54,7 +57,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -63,15 +66,15 @@ xfs_rmapbt_set_root( const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + agf->agf_rmap_root = ptr->s; + be32_add_cpu(&agf->agf_rmap_level, inc); + pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -85,7 +88,8 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_alloc_arg args = { .len = 1 }; int error; xfs_agblock_t bno; @@ -94,20 +98,22 @@ xfs_rmapbt_alloc_block( &bno, 1); if (error) return error; - - trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); + xfs_extent_busy_reuse(pag_group(pag), bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); + /* + * Since rmapbt blocks are sourced from the AGFL, they are allocated one + * at a time and the reservation updates don't require a transaction. + */ + xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); *stat = 1; return 0; @@ -120,20 +126,18 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, - bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); @@ -224,9 +228,9 @@ xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + ptr->s = agf->agf_rmap_root; } /* @@ -340,18 +344,29 @@ xfs_rmapbt_verify( if (!xfs_has_rmapbt(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + unsigned int maxlevel = pag->pagf_rmap_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_rmap_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_rmap_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]); } static void @@ -360,7 +375,7 @@ xfs_rmapbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_rmapbt_verify(bp); @@ -384,7 +399,7 @@ xfs_rmapbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -476,9 +491,19 @@ xfs_rmapbt_keys_contiguous( be32_to_cpu(key2->rmap.rm_startblock)); } -static const struct xfs_btree_ops xfs_rmapbt_ops = { +const struct xfs_btree_ops xfs_rmapbt_ops = { + .name = "rmap", + .type = XFS_BTREE_TYPE_AG, + .geom_flags = XFS_BTGEO_OVERLAPPING, + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), + .sick_mask = XFS_SICK_AG_RMAPBT, .dup_cursor = xfs_rmapbt_dup_cursor, .set_root = xfs_rmapbt_set_root, @@ -498,55 +523,175 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .keys_contiguous = xfs_rmapbt_keys_contiguous, }; -static struct xfs_btree_cur * -xfs_rmapbt_init_common( +/* + * Create a new reverse mapping btree cursor. + * + * For staging cursors tp and agbp are NULL. + */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_buf *agbp, struct xfs_perag *pag) { struct xfs_btree_cur *cur; - /* Overlapping btree; 2 keys per pointer. */ - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ops = &xfs_rmapbt_ops; + cur->bc_group = xfs_group_hold(pag_group(pag)); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level); + } return cur; } -/* Create a new reverse mapping btree cursor. */ +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline unsigned int +xfs_rmapbt_mem_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64)); +} + +/* + * Validate an in-memory rmap btree block. Callers are allowed to generate an + * in-memory btree even if the ondisk feature is not enabled. + */ +static xfs_failaddr_t +xfs_rmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= xfs_rmapbt_maxlevels_ondisk()) + return __this_address; + + maxrecs = xfs_rmapbt_mem_block_maxrecs( + XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = { + .name = "xfs_rmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, + .verify_read = xfs_rmapbt_mem_rw_verify, + .verify_write = xfs_rmapbt_mem_rw_verify, + .verify_struct = xfs_rmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rmapbt_mem_ops = { + .name = "mem_rmap", + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_mem_buf_ops, + .diff_two_keys = xfs_rmapbt_diff_two_keys, + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ struct xfs_btree_cur * -xfs_rmapbt_init_cursor( - struct xfs_mount *mp, +xfs_rmapbt_mem_cursor( + struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) + struct xfbtree *xfbt) { - struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_ag.agbp = agbp; + cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops, + xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + + cur->bc_group = xfs_group_hold(pag_group(pag)); return cur; } -/* Create a new reverse mapping btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_rmapbt_stage_cursor( +/* Create an in-memory rmap btree. */ +int +xfs_rmapbt_mem_init( struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_agnumber_t agno) { - struct xfs_btree_cur *cur; + xfbt->owner = agno; + return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops); +} - cur = xfs_rmapbt_init_common(mp, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); - return cur; +/* Compute the max possible height for reverse mapping btrees in memory. */ +static unsigned int +xfs_rmapbt_mem_maxlevels(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2; + + /* + * How tall can an in-memory rmap btree become if we filled the entire + * AG with rmap records? + */ + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec)); } +#else +# define xfs_rmapbt_mem_maxlevels() (0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ /* * Install a new reverse mapping btree root. Caller is responsible for @@ -563,12 +708,12 @@ xfs_rmapbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_root = cpu_to_be32(afake->af_root); + agf->agf_rmap_level = cpu_to_be32(afake->af_levels); agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | XFS_AGF_RMAP_BLOCKS); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a reverse mapping btree block. */ @@ -586,10 +731,11 @@ xfs_rmapbt_block_maxrecs( /* * Calculate number of records in an rmap btree block. */ -int +unsigned int xfs_rmapbt_maxrecs( - int blocklen, - int leaf) + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) { blocklen -= XFS_RMAP_BLOCK_LEN; return xfs_rmapbt_block_maxrecs(blocklen, leaf); @@ -618,7 +764,8 @@ xfs_rmapbt_maxlevels_ondisk(void) * like if it consumes almost all the blocks in the AG due to maximal * sharing factor. */ - return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS); + return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS), + xfs_rmapbt_mem_maxlevels()); } /* Compute the maximum height of an rmap btree. */ @@ -716,7 +863,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 3244715dd111..119b1567cd0e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; struct xbtree_afakeroot; +struct xfbtree; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -44,11 +45,10 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); -int xfs_rmapbt_maxrecs(int blocklen, int leaf); +unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, @@ -64,4 +64,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void); int __init xfs_rmapbt_init_cur_cache(void); void xfs_rmapbt_destroy_cur_cache(void); +struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_agnumber_t agno); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 31100120b2c5..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -13,32 +13,94 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_rtbitmap.h" +#include "xfs_health.h" +#include "xfs_sb.h" +#include "xfs_errortag.h" +#include "xfs_log.h" +#include "xfs_buf_item.h" +#include "xfs_extent_busy.h" /* * Realtime allocator bitmap functions shared with userspace. */ -/* - * Real time buffers need verifiers to avoid runtime warnings during IO. - * We don't have anything to verify, however, so these are just dummy - * operations. - */ +static xfs_failaddr_t +xfs_rtbuf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (!xfs_verify_magic(bp, hdr->rt_magic)) + return __this_address; + if (!xfs_has_rtgroups(mp)) + return __this_address; + if (!xfs_has_crc(mp)) + return __this_address; + if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp))) + return __this_address; + return NULL; +} + static void xfs_rtbuf_verify_read( - struct xfs_buf *bp) + struct xfs_buf *bp) { + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) { + fa = __this_address; + goto fail; + } + + if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) { + fa = __this_address; + goto fail; + } + + fa = xfs_rtbuf_verify(bp); + if (fa) + goto fail; + return; +fail: + xfs_verifier_error(bp, -EFSCORRUPTED, fa); } static void xfs_rtbuf_verify_write( struct xfs_buf *bp) { - return; + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + fa = xfs_rtbuf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + if (bip) + hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF); } const struct xfs_buf_ops xfs_rtbuf_ops = { @@ -47,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +const struct xfs_buf_ops xfs_rtbitmap_buf_ops = { + .name = "xfs_rtbitmap", + .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + +const struct xfs_buf_ops xfs_rtsummary_buf_ops = { + .name = "xfs_rtsummary", + .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + /* Release cached rt bitmap and summary buffers. */ void xfs_rtbuf_cache_relse( @@ -68,32 +146,35 @@ xfs_rtbuf_cache_relse( * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( struct xfs_rtalloc_args *args, xfs_fileoff_t block, /* block number in bitmap or summary */ - int issum) /* is summary not bitmap */ + enum xfs_rtg_inodes type) { + struct xfs_inode *ip = args->rtg->rtg_inodes[type]; struct xfs_mount *mp = args->mp; struct xfs_buf **cbpp; /* cached block buffer */ xfs_fileoff_t *coffp; /* cached block number */ struct xfs_buf *bp; /* block buffer, result */ - struct xfs_inode *ip; /* bitmap or summary inode */ struct xfs_bmbt_irec map; - enum xfs_blft type; + enum xfs_blft buf_type; int nmap = 1; int error; - if (issum) { + switch (type) { + case XFS_RTGI_SUMMARY: cbpp = &args->sumbp; coffp = &args->sumoff; - ip = mp->m_rsumip; - type = XFS_BLFT_RTSUMMARY_BUF; - } else { + buf_type = XFS_BLFT_RTSUMMARY_BUF; + break; + case XFS_RTGI_BITMAP: cbpp = &args->rbmbp; coffp = &args->rbmoff; - ip = mp->m_rbmip; - type = XFS_BLFT_RTBITMAP_BUF; + buf_type = XFS_BLFT_RTBITMAP_BUF; + break; + default: + return -EINVAL; } /* @@ -115,31 +196,75 @@ xfs_rtbuf_get( if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { + xfs_rtginode_mark_sick(args->rtg, type); return -EFSCORRUPTED; + } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + mp->m_bsize, 0, &bp, + xfs_rtblock_ops(mp, type)); + if (xfs_metadata_is_sick(error)) + xfs_rtginode_mark_sick(args->rtg, type); if (error) return error; - xfs_trans_buf_set_type(args->tp, bp, type); + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) { + xfs_buf_mark_corrupt(bp); + xfs_trans_brelse(args->tp, bp); + xfs_rtginode_mark_sick(args->rtg, type); + return -EFSCORRUPTED; + } + } + + xfs_trans_buf_set_type(args->tp, bp, buf_type); *cbpp = bp; *coffp = block; return 0; } +int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) { + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP); + return -EFSCORRUPTED; + } + + return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP); +} + +int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) { + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY); + return -EFSCORRUPTED; + } + return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY); +} + /* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. + * Searching backward from start find the first block whose allocated/free state + * is different from start's. */ int xfs_rtfind_back( struct xfs_rtalloc_args *args, xfs_rtxnum_t start, /* starting rtext to look at */ - xfs_rtxnum_t limit, /* last rtext to look at */ xfs_rtxnum_t *rtx) /* out: start rtext found */ { struct xfs_mount *mp = args->mp; @@ -168,7 +293,7 @@ xfs_rtfind_back( */ word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; + len = start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. @@ -309,6 +434,8 @@ xfs_rtfind_forw( xfs_rtword_t incore; unsigned int word; /* word number in the buffer */ + ASSERT(start <= limit); + /* * Compute and read in starting bitmap block for starting block. */ @@ -464,6 +591,7 @@ xfs_rtmodify_summary( { struct xfs_mount *mp = args->mp; xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; unsigned int infoword; xfs_suminfo_t val; int error; @@ -475,11 +603,11 @@ xfs_rtmodify_summary( infoword = xfs_rtsumoffs_to_infoword(mp, so); val = xfs_suminfo_add(args, infoword, delta); - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache) { + if (val == 0 && log + 1 == rsum_cache[bbno]) + rsum_cache[bbno] = log; + if (val != 0 && log >= rsum_cache[bbno]) + rsum_cache[bbno] = log + 1; } xfs_trans_log_rtsummary(args, infoword); @@ -691,14 +819,14 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(args, start, 0, &preblock); + error = xfs_rtfind_back(args, start, &preblock); if (error) { return error; } /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -922,19 +1050,25 @@ xfs_rtcheck_alloc_range( int xfs_rtfree_extent( struct xfs_trans *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, xfs_rtxnum_t start, /* starting rtext number to free */ xfs_rtxlen_t len) /* length of extent freed */ { struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xfs_rtalloc_args args = { .mp = mp, .tp = tp, + .rtg = rtg, }; int error; struct timespec64 atime; - ASSERT(mp->m_rbmip->i_itemp != NULL); - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ASSERT(rbmip->i_itemp != NULL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); if (error) @@ -951,19 +1085,21 @@ xfs_rtfree_extent( * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* * If we've now freed all the blocks, reset the file sequence - * number to 0. + * number to 0 for pre-RTG file systems. */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + if (!xfs_has_rtgroups(mp) && + tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - atime = inode_get_atime(VFS_I(mp->m_rbmip)); + atime = inode_get_atime(VFS_I(rbmip)); atime.tv_sec = 0; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), atime); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); } error = 0; out: @@ -979,84 +1115,96 @@ out: int xfs_rtfree_blocks( struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { struct xfs_mount *mp = tp->t_mountp; - xfs_rtxnum_t start; - xfs_filblks_t len; xfs_extlen_t mod; + int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); - len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + mod = xfs_blen_to_rtxoff(mp, rtlen); if (mod) { ASSERT(mod == 0); return -EIO; } - start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + mod = xfs_rtb_to_rtxoff(mp, rtbno); if (mod) { ASSERT(mod == 0); return -EIO; } - return xfs_rtfree_extent(tp, start, len); + error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno), + xfs_extlen_to_rtxlen(mp, rtlen)); + if (error) + return error; + + if (xfs_has_rtgroups(mp)) + xfs_extent_busy_insert(tp, rtg_group(rtg), + xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0); + + return 0; } /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, + xfs_rtxnum_t start, + xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_rtalloc_args args = { + .rtg = rtg, .mp = mp, .tp = tp, }; - struct xfs_rtalloc_rec rec; - xfs_rtxnum_t rtstart; - xfs_rtxnum_t rtend; - xfs_rtxnum_t high_key; - int is_free; int error = 0; - if (low_rec->ar_startext > high_rec->ar_startext) + if (start > end) return -EINVAL; - if (low_rec->ar_startext >= mp->m_sb.sb_rextents || - low_rec->ar_startext == high_rec->ar_startext) + if (start == end || start >= rtg->rtg_extents) return 0; - high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1); + end = min(end, rtg->rtg_extents - 1); + + if (xfs_has_zoned(mp)) + return -EINVAL; /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startext; - while (rtstart <= high_key) { + while (start <= end) { + struct xfs_rtalloc_rec rec; + int is_free; + xfs_rtxnum_t rtend; + /* Is the first block free? */ - error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, start, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, start, end, &rtend); if (error) break; if (is_free) { - rec.ar_startext = rtstart; - rec.ar_extcount = rtend - rtstart + 1; + rec.ar_startext = start; + rec.ar_extcount = rtend - start + 1; - error = fn(mp, tp, &rec, priv); + error = fn(rtg, tp, &rec, priv); if (error) break; } - rtstart = rtend + 1; + start = rtend + 1; } xfs_rtbuf_cache_relse(&args); @@ -1066,31 +1214,27 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) { - struct xfs_rtalloc_rec keys[2]; - - keys[0].ar_startext = 0; - keys[1].ar_startext = mp->m_sb.sb_rextents - 1; - keys[0].ar_extcount = keys[1].ar_extcount = 0; - - return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn, + priv); } /* Is the given extent all free? */ int xfs_rtalloc_extent_is_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free) { struct xfs_rtalloc_args args = { - .mp = mp, + .mp = rtg_mount(rtg), + .rtg = rtg, .tp = tp, }; xfs_rtxnum_t end; @@ -1106,72 +1250,255 @@ xfs_rtalloc_extent_is_free( return 0; } +/* Compute the number of rt extents tracked by a single bitmap block. */ +xfs_rtxnum_t +xfs_rtbitmap_rtx_per_rbmblock( + struct xfs_mount *mp) +{ + unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize; + + if (xfs_has_rtgroups(mp)) + rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo); + + return rbmblock_bytes * NBBY; +} + /* * Compute the number of rtbitmap blocks needed to track the given number of rt * extents. */ xfs_filblks_t -xfs_rtbitmap_blockcount( +xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { - return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); + if (xfs_has_zoned(mp)) + return 0; + return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } -/* - * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct - * use of rt volumes with more than 2^32 extents. - */ -uint8_t -xfs_compute_rextslog( - xfs_rtbxlen_t rtextents) +/* How many rt extents does each rtbitmap file track? */ +static inline xfs_rtbxlen_t +xfs_rtbitmap_bitcount( + struct xfs_mount *mp) { - if (!rtextents) + if (!mp->m_sb.sb_rextents) return 0; - return xfs_highbit64(rtextents); + + /* rtgroup size can be nonzero even if rextents is zero */ + if (xfs_has_rtgroups(mp)) + return mp->m_sb.sb_rgextents; + + return mp->m_sb.sb_rextents; } /* - * Compute the number of rtbitmap words needed to populate every block of a - * bitmap that is large enough to track the given number of rt extents. + * Compute the number of rtbitmap blocks used for a given file system. */ -unsigned long long -xfs_rtbitmap_wordcount( - struct xfs_mount *mp, - xfs_rtbxlen_t rtextents) +xfs_filblks_t +xfs_rtbitmap_blockcount( + struct xfs_mount *mp) { - xfs_filblks_t blocks; - - blocks = xfs_rtbitmap_blockcount(mp, rtextents); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; + return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp)); } -/* Compute the number of rtsummary blocks needed to track the given rt space. */ +/* + * Compute the geometry of the rtsummary file needed to track the given rt + * space. + */ xfs_filblks_t xfs_rtsummary_blockcount( struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) + unsigned int *rsumlevels) { + xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; - rsumwords = (unsigned long long)rsumlevels * rbmblocks; - return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + + *rsumlevels = xfs_compute_rextslog(rextents) + 1; + rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); + return howmany_64(rsumwords, mp->m_blockwsize); +} + +static int +xfs_rtfile_alloc_blocks( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_bmbt_irec *map) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nmap = 1; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, + XFS_GROWFSRT_SPACE_RES(mp, count_fsb), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_METADATA, 0, map, &nmap); + if (error) + goto out_trans_cancel; + + return xfs_trans_commit(tp); + +out_trans_cancel: + xfs_trans_cancel(tp); + return error; +} + +/* Get a buffer for the block. */ +static int +xfs_rtfile_initialize_block( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + xfs_fsblock_t fsbno, + void *data) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *ip = rtg->rtg_inodes[type]; + struct xfs_trans *tp; + struct xfs_buf *bp; + void *bufdata; + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + enum xfs_blft buf_type; + int error; + + if (type == XFS_RTGI_BITMAP) + buf_type = XFS_BLFT_RTBITMAP_BUF; + else if (type == XFS_RTGI_SUMMARY) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + return -EINVAL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + bufdata = bp->b_addr; + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = xfs_rtblock_ops(mp, type); + + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (type == XFS_RTGI_BITMAP) + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + else + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(ip->i_ino); + hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid); + + bufdata += sizeof(*hdr); + } + + if (data) + memcpy(bufdata, data, copylen); + else + memset(bufdata, 0, copylen); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + return xfs_trans_commit(tp); } /* - * Compute the number of rtsummary info words needed to populate every block of - * a summary file that is large enough to track the given rt space. + * Allocate space to the bitmap or summary file, and zero it, for growfs. + * @data must be a contiguous buffer large enough to fill all blocks in the + * file; or NULL to initialize the contents to zeroes. */ -unsigned long long -xfs_rtsummary_wordcount( - struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) +int +xfs_rtfile_initialize_blocks( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + xfs_fileoff_t offset_fsb, /* offset to start from */ + xfs_fileoff_t end_fsb, /* offset to allocate to */ + void *data) /* data to fill the blocks */ { - xfs_filblks_t blocks; + struct xfs_mount *mp = rtg_mount(rtg); + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + + while (offset_fsb < end_fsb) { + struct xfs_bmbt_irec map; + xfs_filblks_t i; + int error; + + error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type], + offset_fsb, end_fsb - offset_fsb, &map); + if (error) + return error; + + /* + * Now we need to clear the allocated blocks. + * + * Do this one block per transaction, to keep it simple. + */ + for (i = 0; i < map.br_blockcount; i++) { + error = xfs_rtfile_initialize_block(rtg, type, + map.br_startblock + i, data); + if (error) + return error; + if (data) + data += copylen; + } - blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; + offset_fsb = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +int +xfs_rtbitmap_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize; + if (init && !xfs_has_rtgroups(mp)) { + ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + inode_set_atime(VFS_I(ip), 0, 0); + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_rtsummary_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; } diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 274dc7dae1fa..22e5d9cd95f4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -6,7 +6,10 @@ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ +#include "xfs_rtgroup.h" + struct xfs_rtalloc_args { + struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; @@ -19,13 +22,37 @@ struct xfs_rtalloc_args { static inline xfs_rtblock_t xfs_rtx_to_rtb( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); + + if (mp->m_rtxblklog >= 0) + return start + (rtx << mp->m_rtxblklog); + return start + (rtx * mp->m_sb.sb_rextsize); +} + +/* Convert an rgbno into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rgbno_to_rtx( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rgbno >> mp->m_rtxblklog; + return rgbno / mp->m_sb.sb_rextsize; +} + +static inline uint64_t +xfs_rtbxlen_to_blen( + struct xfs_mount *mp, + xfs_rtbxlen_t rtbxlen) +{ if (mp->m_rtxblklog >= 0) - return rtx << mp->m_rtxblklog; + return rtbxlen << mp->m_rtxblklog; - return rtx * mp->m_sb.sb_rextsize; + return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t @@ -62,84 +89,90 @@ xfs_extlen_to_rtxlen( return len / mp->m_sb.sb_rextsize; } -/* Convert an rt block number into an rt extent number. */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtx( +/* Convert an rt block count into an rt extent count. */ +static inline xfs_rtbxlen_t +xfs_blen_to_rtbxlen( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + uint64_t blen) { if (likely(mp->m_rtxblklog >= 0)) - return rtbno >> mp->m_rtxblklog; + return blen >> mp->m_rtxblklog; - return div_u64(rtbno, mp->m_sb.sb_rextsize); + return div_u64(blen, mp->m_sb.sb_rextsize); } -/* Return the offset of an rt block number within an rt extent. */ +/* Return the offset of a file block length within an rt extent. */ static inline xfs_extlen_t -xfs_rtb_to_rtxoff( +xfs_blen_to_rtxoff( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_filblks_t blen) { if (likely(mp->m_rtxblklog >= 0)) - return rtbno & mp->m_rtxblkmask; + return blen & mp->m_rtxblkmask; - return do_div(rtbno, mp->m_sb.sb_rextsize); + return do_div(blen, mp->m_sb.sb_rextsize); } -/* - * Crack an rt block number into an rt extent number and an offset within that - * rt extent. Returns the rt extent number directly and the offset in @off. - */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtxrem( +/* Round this block count up to the nearest rt extent size. */ +static inline xfs_filblks_t +xfs_blen_roundup_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno, - xfs_extlen_t *off) + xfs_filblks_t blen) { - if (likely(mp->m_rtxblklog >= 0)) { - *off = rtbno & mp->m_rtxblkmask; - return rtbno >> mp->m_rtxblklog; - } - - return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); + return roundup_64(blen, mp->m_sb.sb_rextsize); } -/* - * Convert an rt block number into an rt extent number, rounding up to the next - * rt extent if the rt block is not aligned to an rt extent boundary. - */ +/* Convert an rt block number into an rt extent number. */ static inline xfs_rtxnum_t -xfs_rtb_to_rtxup( +xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { - if (likely(mp->m_rtxblklog >= 0)) { - if (rtbno & mp->m_rtxblkmask) - return (rtbno >> mp->m_rtxblklog) + 1; + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; + if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; - } + return div_u64(rtbno, mp->m_sb.sb_rextsize); +} - if (do_div(rtbno, mp->m_sb.sb_rextsize)) - rtbno++; - return rtbno; +/* Return the offset of a rtgroup block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rgbno_to_rtxoff( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + return rgbno % mp->m_sb.sb_rextsize; } -/* Round this rtblock up to the nearest rt extent size. */ -static inline xfs_rtblock_t -xfs_rtb_roundup_rtx( +/* Return the offset of an rt block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { - return roundup_64(rtbno, mp->m_sb.sb_rextsize); + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; + if (likely(mp->m_rtxblklog >= 0)) + return rtbno & mp->m_rtxblkmask; + return do_div(rtbno, mp->m_sb.sb_rextsize); +} + +/* Round this file block offset up to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_fileoff_roundup_rtx( + struct xfs_mount *mp, + xfs_fileoff_t off) +{ + return roundup_64(off, mp->m_sb.sb_rextsize); } -/* Round this rtblock down to the nearest rt extent size. */ +/* Round this file block offset down to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_rounddown_rtx( +xfs_fileoff_rounddown_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) { - return rounddown_64(rtbno, mp->m_sb.sb_rextsize); + return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. */ @@ -148,6 +181,9 @@ xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) + return div_u64(rtx, mp->m_rtx_per_rbmblock); + return rtx >> mp->m_blkbit_log; } @@ -157,6 +193,13 @@ xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) { + unsigned int mod; + + div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); + return mod; + } + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } @@ -166,6 +209,9 @@ xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { + if (xfs_has_rtgroups(mp)) + return rbmoff * mp->m_rtx_per_rbmblock; + return rbmoff << mp->m_blkbit_log; } @@ -175,7 +221,14 @@ xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_rtword_raw *words = args->rbmbp->b_addr; + struct xfs_mount *mp = args->mp; + union xfs_rtword_raw *words; + struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; + + if (xfs_has_rtgroups(mp)) + words = (union xfs_rtword_raw *)(hdr + 1); + else + words = args->rbmbp->b_addr; return words + index; } @@ -188,6 +241,8 @@ xfs_rtbitmap_getword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(word->rtg); return word->old; } @@ -200,7 +255,10 @@ xfs_rtbitmap_setword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); - word->old = value; + if (xfs_has_rtgroups(args->mp)) + word->rtg = cpu_to_be32(value); + else + word->old = value; } /* @@ -225,6 +283,9 @@ xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { + if (xfs_has_rtgroups(mp)) + return rsumoff / mp->m_blockwsize; + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } @@ -239,6 +300,9 @@ xfs_rtsumoffs_to_infoword( { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + if (xfs_has_rtgroups(mp)) + return rsumoff % mp->m_blockwsize; + return rsumoff & mask; } @@ -248,7 +312,13 @@ xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_suminfo_raw *info = args->sumbp->b_addr; + union xfs_suminfo_raw *info; + struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; + + if (xfs_has_rtgroups(args->mp)) + info = (union xfs_suminfo_raw *)(hdr + 1); + else + info = args->sumbp->b_addr; return info + index; } @@ -261,6 +331,8 @@ xfs_suminfo_get( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(info->rtg); return info->old; } @@ -273,10 +345,28 @@ xfs_suminfo_add( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) { + be32_add_cpu(&info->rtg, delta); + return be32_to_cpu(info->rtg); + } + info->old += delta; return info->old; } +static inline const struct xfs_buf_ops * +xfs_rtblock_ops( + struct xfs_mount *mp, + enum xfs_rtg_inodes type) +{ + if (xfs_has_rtgroups(mp)) { + if (type == XFS_RTGI_SUMMARY) + return &xfs_rtsummary_buf_ops; + return &xfs_rtbitmap_buf_ops; + } + return &xfs_rtbuf_ops; +} + /* * Functions for walking free space rtextents in the realtime bitmap. */ @@ -286,37 +376,19 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); #ifdef CONFIG_XFS_RT void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); - -int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, - int issum); - -static inline int -xfs_rtbitmap_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 0); -} - -static inline int -xfs_rtsummary_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 1); -} - +int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); +int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, - xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); + xfs_rtxnum_t *rtblock); int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -327,73 +399,55 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, +int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtxnum_t start, xfs_rtxlen_t len, - bool *is_free); -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to free */ - xfs_rtxlen_t len); /* length of extent freed */ - +int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); +int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ -int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, - xfs_filblks_t rtlen); - -uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); - -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} +int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t - rtextents); -unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, +xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); - xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); -unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); + unsigned int *rsumlevels); + +int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb, void *data); +int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) -# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) + +static inline int xfs_rtfree_blocks(struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + return -ENOSYS; +} # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) -# define xfs_compute_rextslog(rtx) (0) -# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t -xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } -# define xfs_rtbitmap_wordcount(mp, r) (0) -# define xfs_rtsummary_blockcount(mp, l, b) (0) -# define xfs_rtsummary_wordcount(mp, l, b) (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c new file mode 100644 index 000000000000..9186c58e83d5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_buf_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" + +/* Find the first usable fsblock in this rtgroup. */ +static inline uint32_t +xfs_rtgroup_min_block( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + if (xfs_has_rtsb(mp) && rgno == 0) + return mp->m_sb.sb_rextsize; + + return 0; +} + +/* Precompute this group's geometry */ +void +xfs_rtgroup_calc_geometry( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno); +} + +int +xfs_rtgroup_alloc( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL); + if (!rtg) + return -ENOMEM; + + xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents); + + error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG); + if (error) + goto out_free_rtg; + return 0; + +out_free_rtg: + kfree(rtg); + return error; +} + +void +xfs_rtgroup_free( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL); +} + +/* Free a range of incore rtgroup objects. */ +void +xfs_free_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno) +{ + xfs_rgnumber_t rgno; + + for (rgno = first_rgno; rgno < end_rgno; rgno++) + xfs_rtgroup_free(mp, rgno); +} + +/* Initialize some range of incore rtgroup objects. */ +int +xfs_initialize_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + xfs_rgnumber_t index; + int error; + + if (first_rgno >= end_rgno) + return 0; + + for (index = first_rgno; index < end_rgno; index++) { + error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents); + if (error) + goto out_unwind_new_rtgs; + } + + return 0; + +out_unwind_new_rtgs: + xfs_free_rtgroups(mp, first_rgno, index); + return error; +} + +/* Compute the number of rt extents in this realtime group. */ +xfs_rtxnum_t +__xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + ASSERT(rgno < rgcount); + if (rgno == rgcount - 1) + return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents); + + ASSERT(xfs_has_rtgroups(mp)); + return mp->m_sb.sb_rgextents; +} + +xfs_rtxnum_t +xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); +} + +/* + * Update the rt extent count of the previous tail rtgroup if it changed during + * recovery (i.e. recovery of a growfs). + */ +int +xfs_update_last_rtgroup_size( + struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount) +{ + struct xfs_rtgroup *rtg; + + ASSERT(prev_rgcount > 0); + + rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1); + if (!rtg) + return -EFSCORRUPTED; + rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1, + mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + xfs_rtgroup_rele(rtg); + return 0; +} + +/* Lock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_lock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. + */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Unlock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_unlock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } +} + +/* + * Join realtime group metadata inodes to the transaction. The ILOCKs will be + * released on transaction commit. + */ +void +xfs_rtgroup_trans_join( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); + + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { + xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Retrieve rt group geometry. */ +int +xfs_rtgroup_get_geometry( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + /* Fill out form. */ + memset(rgeo, 0, sizeof(*rgeo)); + rgeo->rg_number = rtg_rgno(rtg); + rgeo->rg_length = rtg_blocks(rtg); + xfs_rtgroup_geom_health(rtg, rgeo); + return 0; +} + +#ifdef CONFIG_PROVE_LOCKING +static struct lock_class_key xfs_rtginode_lock_class; + +static int +xfs_rtginode_ilock_cmp_fn( + const struct lockdep_map *m1, + const struct lockdep_map *m2) +{ + const struct xfs_inode *ip1 = + container_of(m1, struct xfs_inode, i_lock.dep_map); + const struct xfs_inode *ip2 = + container_of(m2, struct xfs_inode, i_lock.dep_map); + + if (ip1->i_projid < ip2->i_projid) + return -1; + if (ip1->i_projid > ip2->i_projid) + return 1; + return 0; +} + +static inline void +xfs_rtginode_ilock_print_fn( + const struct lockdep_map *m) +{ + const struct xfs_inode *ip = + container_of(m, struct xfs_inode, i_lock.dep_map); + + printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid, + xfs_metafile_type_str(ip->i_metatype)); +} + +/* + * Most of the time each of the RTG inode locks are only taken one at a time. + * But when committing deferred ops, more than one of a kind can be taken. + * However, deferred rt ops will be committed in rgno order so there is no + * potential for deadlocks. The code here is needed to tell lockdep about this + * order. + */ +static inline void +xfs_rtginode_lockdep_setup( + struct xfs_inode *ip, + xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class, + type); + lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn, + xfs_rtginode_ilock_print_fn); +} +#else +#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0) +#endif /* CONFIG_PROVE_LOCKING */ + +struct xfs_rtginode_ops { + const char *name; /* short name */ + + enum xfs_metafile_type metafile_type; + + unsigned int sick; /* rtgroup sickness flag */ + + unsigned int fmt_mask; /* all valid data fork formats */ + + /* Does the fs have this feature? */ + bool (*enabled)(const struct xfs_mount *mp); + + /* Create this rtgroup metadata inode and initialize it. */ + int (*create)(struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init); +}; + +static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { + [XFS_RTGI_BITMAP] = { + .name = "bitmap", + .metafile_type = XFS_METAFILE_RTBITMAP, + .sick = XFS_SICK_RG_BITMAP, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtbitmap_create, + }, + [XFS_RTGI_SUMMARY] = { + .name = "summary", + .metafile_type = XFS_METAFILE_RTSUMMARY, + .sick = XFS_SICK_RG_SUMMARY, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtsummary_create, + }, + [XFS_RTGI_RMAP] = { + .name = "rmap", + .metafile_type = XFS_METAFILE_RTRMAP, + .sick = XFS_SICK_RG_RMAPBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use the + * rtrmapbt predicate here. + */ + .enabled = xfs_has_rmapbt, + .create = xfs_rtrmapbt_create, + }, + [XFS_RTGI_REFCOUNT] = { + .name = "refcount", + .metafile_type = XFS_METAFILE_RTREFCOUNT, + .sick = XFS_SICK_RG_REFCNTBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* same comment about growfs and rmap inodes applies here */ + .enabled = xfs_has_reflink, + .create = xfs_rtrefcountbt_create, + }, +}; + +/* Return the shortname of this rtgroup inode. */ +const char * +xfs_rtginode_name( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].name; +} + +/* Return the metafile type of this rtgroup inode. */ +enum xfs_metafile_type +xfs_rtginode_metafile_type( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].metafile_type; +} + +/* Should this rtgroup inode be present? */ +bool +xfs_rtginode_enabled( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + if (!ops->enabled) + return true; + return ops->enabled(rtg_mount(rtg)); +} + +/* Mark an rtgroup inode sick */ +void +xfs_rtginode_mark_sick( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + xfs_group_mark_sick(rtg_group(rtg), ops->sick); +} + +/* Load and existing rtgroup inode into the rtgroup structure. */ +int +xfs_rtginode_load( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!xfs_has_rtgroups(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_RTGI_BITMAP: + ino = mp->m_sb.sb_rbmino; + break; + case XFS_RTGI_SUMMARY: + ino = mp->m_sb.sb_rsumino; + break; + default: + /* None of the other types exist on !rtgroups */ + return 0; + } + + error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type, + &ip); + } else { + const char *path; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!path) + return -ENOMEM; + error = xfs_metadir_load(tp, mp->m_rtdirip, path, + ops->metafile_type, &ip); + kfree(path); + } + + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_rtginode_mark_sick(rtg, type); + return error; + } + + if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type); + rtg->rtg_inodes[type] = ip; + return 0; +} + +/* Release an rtgroup metadata inode. */ +void +xfs_rtginode_irele( + struct xfs_inode **ipp) +{ + if (*ipp) + xfs_irele(*ipp); + *ipp = NULL; +} + +/* Add a metadata inode for a realtime rmap btree. */ +int +xfs_rtginode_create( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + bool init) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_metadir_update upd = { + .dp = mp->m_rtdirip, + .metafile_type = ops->metafile_type, + }; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + upd.path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!upd.path) + return -ENOMEM; + + error = xfs_metadir_start_create(&upd); + if (error) + goto out_path; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + goto out_cancel; + + xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); + + upd.ip->i_projid = rtg_rgno(rtg); + error = ops->create(rtg, upd.ip, upd.tp, init); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_path; + + kfree(upd.path); + xfs_finish_inode_setup(upd.ip); + rtg->rtg_inodes[type] = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } +out_path: + kfree(upd.path); + return error; +} + +/* Create the parent directory for all rtgroup inodes and load it. */ +int +xfs_rtginode_mkdir_parent( + struct xfs_mount *mp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip); +} + +/* Load the parent directory of all rtgroup inodes. */ +int +xfs_rtginode_load_parent( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups", + XFS_METAFILE_DIR, &mp->m_rtdirip); +} + +/* Check superblock fields for a read or a write. */ +static xfs_failaddr_t +xfs_rtsb_verify_common( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + + if (!xfs_verify_magic(bp, rsb->rsb_magicnum)) + return __this_address; + if (rsb->rsb_pad) + return __this_address; + + /* Everything to the end of the fs block must be zero */ + if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb))) + return __this_address; + + return NULL; +} + +/* Check superblock fields for a read or revalidation. */ +static inline xfs_failaddr_t +xfs_rtsb_verify_all( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + struct xfs_mount *mp = bp->b_mount; + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) + return fa; + + if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX)) + return __this_address; + if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid)) + return __this_address; + if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return NULL; +} + +static void +xfs_rtsb_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) { + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + return; + } + + fa = xfs_rtsb_verify_all(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +static void +xfs_rtsb_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_rtsb_buf_ops = { + .name = "xfs_rtsb", + .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) }, + .verify_read = xfs_rtsb_read_verify, + .verify_write = xfs_rtsb_write_verify, + .verify_struct = xfs_rtsb_verify_all, +}; + +/* Update a realtime superblock from the primary fs super */ +void +xfs_update_rtsb( + struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp) +{ + const struct xfs_dsb *dsb = sb_bp->b_addr; + struct xfs_rtsb *rsb = rtsb_bp->b_addr; + const uuid_t *meta_uuid; + + rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC); + + rsb->rsb_pad = 0; + memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX); + + memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid)); + + /* + * The metadata uuid is the fs uuid if the metauuid feature is not + * enabled. + */ + if (dsb->sb_features_incompat & + cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID)) + meta_uuid = &dsb->sb_meta_uuid; + else + meta_uuid = &dsb->sb_uuid; + memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid)); +} + +/* + * Update the realtime superblock from a filesystem superblock and log it to + * the given transaction. + */ +struct xfs_buf * +xfs_log_rtsb( + struct xfs_trans *tp, + const struct xfs_buf *sb_bp) +{ + struct xfs_buf *rtsb_bp; + + if (!xfs_has_rtsb(tp->t_mountp)) + return NULL; + + rtsb_bp = xfs_trans_getrtsb(tp); + if (!rtsb_bp) { + /* + * It's possible for the rtgroups feature to be enabled but + * there is no incore rt superblock buffer if the rt geometry + * was specified at mkfs time but the rt section has not yet + * been attached. In this case, rblocks must be zero. + */ + ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0); + return NULL; + } + + xfs_update_rtsb(rtsb_bp, sb_bp); + xfs_trans_ordered_buf(tp, rtsb_bp); + return rtsb_bp; +} diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h new file mode 100644 index 000000000000..d36a6ae0abe5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __LIBXFS_RTGROUP_H +#define __LIBXFS_RTGROUP_H 1 + +#include "xfs_group.h" + +struct xfs_mount; +struct xfs_trans; + +enum xfs_rtg_inodes { + XFS_RTGI_BITMAP, /* allocation bitmap */ + XFS_RTGI_SUMMARY, /* allocation summary */ + XFS_RTGI_RMAP, /* rmap btree inode */ + XFS_RTGI_REFCOUNT, /* refcount btree inode */ + + XFS_RTGI_MAX, +}; + +#ifdef MAX_LOCKDEP_SUBCLASSES +static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES); +#endif + +/* + * Realtime group incore structure, similar to the per-AG structure. + */ +struct xfs_rtgroup { + struct xfs_group rtg_group; + + /* per-rtgroup metadata inodes */ + struct xfs_inode *rtg_inodes[XFS_RTGI_MAX]; + + /* Number of blocks in this group */ + xfs_rtxnum_t rtg_extents; + + /* + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. + */ + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; +}; + +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. + */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + +static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_rtgroup, rtg_group); +} + +static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg) +{ + return &rtg->rtg_group; +} + +static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_mount; +} + +static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_gno; +} + +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + +static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_BITMAP]; +} + +static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; +} + +static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_RMAP]; +} + +static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; +} + +/* Passive rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_get( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_hold( + struct xfs_rtgroup *rtg) +{ + return to_rtg(xfs_group_hold(rtg_group(rtg))); +} + +static inline void +xfs_rtgroup_put( + struct xfs_rtgroup *rtg) +{ + xfs_group_put(rtg_group(rtg)); +} + +/* Active rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_grab( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG)); +} + +static inline void +xfs_rtgroup_rele( + struct xfs_rtgroup *rtg) +{ + xfs_group_rele(rtg_group(rtg)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next_range( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t start_rgno, + xfs_rgnumber_t end_rgno) +{ + return to_rtg(xfs_group_next_range(mp, rtg ? rtg_group(rtg) : NULL, + start_rgno, end_rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg) +{ + return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); +} + +static inline bool +xfs_verify_rgbno( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbno(rtg_group(rtg), rgbno); +} + +/* + * Check that [@rgbno,@len] is a valid extent range in @rtg. + * + * Must only be used for RTG-enabled file systems. + */ +static inline bool +xfs_verify_rgbext( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno, + xfs_extlen_t len) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbext(rtg_group(rtg), rgbno, len); +} + +static inline xfs_rtblock_t +xfs_rgbno_to_rtb( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + return xfs_gbno_to_fsb(rtg_group(rtg), rgbno); +} + +static inline xfs_rgnumber_t +xfs_rtb_to_rgno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG); +} + +static inline xfs_rgblock_t +xfs_rtb_to_rgbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG); +} + +/* Is rtbno the start of a RT group? */ +static inline bool +xfs_rtbno_is_group_start( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0; +} + +/* Convert an rtgroups rt extent number into an rgbno. */ +static inline xfs_rgblock_t +xfs_rtx_to_rgbno( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (likely(mp->m_rtxblklog >= 0)) + return rtx << mp->m_rtxblklog; + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_daddr_t +xfs_rtb_to_daddr( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); +} + +static inline xfs_rtblock_t +xfs_daddr_to_rtb( + struct xfs_mount *mp, + xfs_daddr_t daddr) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; + + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno; + uint32_t rgbno; + + rgno = div_u64_rem(bno, g->blocks, &rgbno); + return ((xfs_rtblock_t)rgno << g->blklog) + rgbno; + } + + return bno; +} + +#ifdef CONFIG_XFS_RT +int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno); + +void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno); +int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents); + +xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno); +void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents); + +int xfs_update_last_rtgroup_size(struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount); + +/* Lock the rt bitmap inode in exclusive mode */ +#define XFS_RTGLOCK_BITMAP (1U << 0) +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) +/* Lock the rt rmap inode in exclusive mode */ +#define XFS_RTGLOCK_RMAP (1U << 2) +/* Lock the rt refcount inode in exclusive mode */ +#define XFS_RTGLOCK_REFCOUNT (1U << 3) + +#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_BITMAP_SHARED | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + unsigned int rtglock_flags); + +int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); + +int xfs_rtginode_mkdir_parent(struct xfs_mount *mp); +int xfs_rtginode_load_parent(struct xfs_trans *tp); + +const char *xfs_rtginode_name(enum xfs_rtg_inodes type); +enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type); +bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + struct xfs_trans *tp); +int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + bool init); +void xfs_rtginode_irele(struct xfs_inode **ipp); + +void xfs_rtginode_irele(struct xfs_inode **ipp); + +static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type)); +} + +void xfs_update_rtsb(struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp); +struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp, + const struct xfs_buf *sb_bp); +#else +static inline void xfs_free_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno) +{ +} + +static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + return 0; +} + +# define xfs_rtgroup_extents(mp, rgno) (0) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) +# define xfs_rtgroup_lock(rtg, gf) ((void)0) +# define xfs_rtgroup_unlock(rtg, gf) ((void)0) +# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) +# define xfs_update_rtsb(bp, sb_bp) ((void)0) +# define xfs_log_rtsb(tp, sb_bp) (NULL) +# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + +#endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c new file mode 100644 index 000000000000..3db5e7a4a945 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_health.h" + +static struct kmem_cache *xfs_rtrefcountbt_cur_cache; + +/* + * Realtime Reference Count btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_refcount_btree.c for more information. + * + * This tree is basically the same as the regular refcount btree except that + * it's rooted in an inode. + */ + +static struct xfs_btree_cur * +xfs_rtrefcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrefcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrefc_mnr[level != 0]; +} + +STATIC int +xfs_rtrefcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrefc_mxr[level != 0]; +} + +/* + * Calculate number of records in a realtime refcount btree inode root. + */ +unsigned int +xfs_rtrefcountbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrefcount_root); + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (2 * sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork so that + * we can resize the in-memory buffer to match it. After a resize to the + * maximum size this function returns the same value as + * xfs_rtrefcountbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrefcountbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrefc_mxr[level != 0]; + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +STATIC void +xfs_rtrefcountbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_rtrefcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_rtrefcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_rtrefcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC int64_t +xfs_rtrefcountbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; +} + +STATIC int64_t +xfs_rtrefcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +static xfs_failaddr_t +xfs_rtrefcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_reflink(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrefc_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); +} + +static void +xfs_rtrefcountbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrefcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrefcountbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrefcountbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { + .name = "xfs_rtrefcountbt", + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, + .verify_read = xfs_rtrefcountbt_read_verify, + .verify_write = xfs_rtrefcountbt_write_verify, + .verify_struct = xfs_rtrefcountbt_verify, +}; + +STATIC int +xfs_rtrefcountbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_rtrefcountbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} + +STATIC enum xbtree_key_contig +xfs_rtrefcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + +static inline void +xfs_rtrefcountbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrefcountbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, + new_size, old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, + new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrefcount_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrefcountbt_ops = { + .name = "rtrefcount", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + .sick_mask = XFS_SICK_RG_REFCNTBT, + + .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrefcountbt_get_minrecs, + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, + .key_diff = xfs_rtrefcountbt_key_diff, + .buf_ops = &xfs_rtrefcountbt_buf_ops, + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .keys_inorder = xfs_rtrefcountbt_keys_inorder, + .recs_inorder = xfs_rtrefcountbt_recs_inorder, + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, + .broot_realloc = xfs_rtrefcountbt_broot_realloc, +}; + +/* Allocate a new rt refcount btree cursor. */ +struct xfs_btree_cur * +xfs_rtrefcountbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_refcount(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + cur->bc_ino.whichfork = XFS_DATA_FORK; + return cur; +} + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrefcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a realtime refcount btree block. */ +static inline unsigned int +xfs_rtrefcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Calculate number of records in an refcount btree block. + */ +unsigned int +xfs_rtrefcountbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime refcount btrees. */ +unsigned int +xfs_rtrefcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrefcountbt_init_cur_cache(void) +{ + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", + xfs_btree_cur_sizeof( + xfs_rtrefcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrefcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrefcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); + xfs_rtrefcountbt_cur_cache = NULL; +} + +/* Compute the maximum height of a realtime refcount btree. */ +void +xfs_rtrefcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtreflink(mp)) { + mp->m_rtrefc_maxlevels = 0; + return; + } + + /* + * The realtime refcountbt lives on the data device, which means that + * its maximum height is constrained by the size of the data device and + * the height required to store one refcount record for each rtextent + * in an rt group. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, + mp->m_sb.sb_rgextents); + + /* Add one level to handle the inode root level. */ + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrefcount btree size for some records. */ +unsigned long long +xfs_rtrefcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +static unsigned long long +xfs_rtrefcountbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrefc_mxr[0] == 0) + return 0; + + return xfs_rtrefcountbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + * We need enough space to hold one record for every rt extent in the rtgroup. + */ +xfs_filblks_t +xfs_rtrefcountbt_calc_reserves( + struct xfs_mount *mp) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +STATIC void +xfs_rtrefcountbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrefcount_root *dblock, + int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + unsigned int rblocklen; + + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); + + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + tkp = xfs_rtrefcount_key_addr(rblock, 1); + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + trp = xfs_rtrefcount_rec_addr(rblock, 1); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reference count btree root in from disk. */ +int +xfs_iformat_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrefcount inodes before adding a realtime + * volume to the filesystem, so we cannot use the rtrefcount predicate + * here. + */ + if (!xfs_has_reflink(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrefc_maxlevels || + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_rtrefcountbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + struct xfs_rtrefcount_root *dblock, + int dblocklen) +{ + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int maxrecs; + unsigned int numrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_key_addr(rblock, 1); + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_rec_addr(rblock, 1); + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reference count btree root out to disk. */ +void +xfs_iflush_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, + ifp->if_broot_bytes, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime refcount btree inode. + */ +int +xfs_rtrefcountbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h new file mode 100644 index 000000000000..a99b7a8aec86 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTREFCOUNT_BTREE_H__ +#define __XFS_RTREFCOUNT_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* refcounts only exist on crc enabled filesystems */ +#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, + unsigned int blocklen, bool leaf); +void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_refcount_rec * +xfs_rtrefcount_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); +int __init xfs_rtrefcountbt_init_cur_cache(void); +void xfs_rtrefcountbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); +unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */ + +static inline struct xfs_refcount_rec * +xfs_rtrefcount_droot_rec_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_droot_key_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_droot_ptr_addr( + struct xfs_rtrefcount_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrefcount_ptr_addr(bb, index, + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrefcount_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) +{ + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrefcount_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrefcount_root); + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, + struct xfs_rtrefcount_root *dblock, int dblocklen); +void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + +#endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c new file mode 100644 index 000000000000..9bdc2cbfc113 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -0,0 +1,1054 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_metafile.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_bmap.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" + +static struct kmem_cache *xfs_rtrmapbt_cur_cache; + +/* + * Realtime Reverse Map btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_rmap_btree.c for more information. + * + * This tree is basically the same as the regular rmap btree except that it + * is rooted in an inode and does not live in free space. + */ + +static struct xfs_btree_cur * +xfs_rtrmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrmap_mnr[level != 0]; +} + +STATIC int +xfs_rtrmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrmap_mxr[level != 0]; +} + +/* Calculate number of records in the ondisk realtime rmap btree inode root. */ +unsigned int +xfs_rtrmapbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrmap_root); + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_rtrmapbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrmapbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrmap_mxr[level != 0]; + return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + +STATIC void +xfs_rtrmapbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); +} + +STATIC void +xfs_rtrmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rtrmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rtrmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + +STATIC int64_t +xfs_rtrmapbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; + + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC int64_t +xfs_rtrmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; + + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + return 0; +} + +static xfs_failaddr_t +xfs_rtrmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_rmapbt(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrmap_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]); +} + +static void +xfs_rtrmapbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrmapbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrmapbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { + .name = "xfs_rtrmapbt", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_read_verify, + .verify_write = xfs_rtrmapbt_write_verify, + .verify_struct = xfs_rtrmapbt_verify, +}; + +STATIC int +xfs_rtrmapbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rtrmapbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC enum xbtree_key_contig +xfs_rtrmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + +static inline void +xfs_rtrmapbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrmapbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size, + old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, + new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrmap_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrmapbt_ops = { + .name = "rtrmap", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_OVERLAPPING | + XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), + .sick_mask = XFS_SICK_RG_RMAPBT, + + .dup_cursor = xfs_rtrmapbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrmapbt_get_minrecs, + .get_maxrecs = xfs_rtrmapbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, + .broot_realloc = xfs_rtrmapbt_broot_realloc, +}; + +/* Allocate a new rt rmap btree cursor. */ +struct xfs_btree_cur * +xfs_rtrmapbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_rmap(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_ino.whichfork = XFS_DATA_FORK; + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + + return cur; +} + +#ifdef CONFIG_XFS_BTREE_IN_MEM +/* + * Validate an in-memory realtime rmap btree block. Callers are allowed to + * generate an in-memory btree even if the ondisk feature is not enabled. + */ +static xfs_failaddr_t +xfs_rtrmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (xfs_has_rmapbt(mp)) { + if (level >= mp->m_rtrmap_maxlevels) + return __this_address; + } else { + if (level >= xfs_rtrmapbt_maxlevels_ondisk()) + return __this_address; + } + + maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rtrmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = { + .name = "xfs_rtrmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_mem_rw_verify, + .verify_write = xfs_rtrmapbt_mem_rw_verify, + .verify_struct = xfs_rtrmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_mem_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +xfs_rtrmapbt_mem_cursor( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + struct xfbtree *xfbt) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + return cur; +} + +/* Create an in-memory realtime rmap btree. */ +int +xfs_rtrmapbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_rgnumber_t rgno) +{ + xfbt->owner = rgno; + return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops); +} +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a rt reverse mapping btree block. */ +static inline unsigned int +xfs_rtrmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Calculate number of records in an rt reverse mapping btree block. + */ +unsigned int +xfs_rtrmapbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTRMAP_BLOCK_LEN; + return xfs_rtrmapbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime reverse mapping btrees. */ +unsigned int +xfs_rtrmapbt_maxlevels_ondisk(void) +{ + unsigned long long max_dblocks; + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; + + /* + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. + * + * On a reflink filesystem, each block in an rtgroup can have up to + * 2^32 (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. However, we're + * likely to run out of blocks in the data device long before that + * happens, which means that we must compute the max height based on + * what the btree will look like if it consumes almost all the blocks + * in the data device due to maximal sharing factor. + */ + max_dblocks = -1U; /* max ag count */ + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; + return xfs_btree_space_to_height(minrecs, max_dblocks); +} + +int __init +xfs_rtrmapbt_init_cur_cache(void) +{ + xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur", + xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrmapbt_cur_cache); + xfs_rtrmapbt_cur_cache = NULL; +} + +/* Compute the maximum height of an rt reverse mapping btree. */ +void +xfs_rtrmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtrmapbt(mp)) { + mp->m_rtrmap_maxlevels = 0; + return; + } + + /* + * The realtime rmapbt lives on the data device, which means that its + * maximum height is constrained by the size of the data device and + * the height required to store one rmap record for each block in an + * rt group. + * + * On a reflink filesystem, each rt block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. This makes the computation of + * maxlevels based on record count meaningless, so we only consider the + * size of the data device. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, + mp->m_sb.sb_dblocks); + if (xfs_has_rtreflink(mp)) { + mp->m_rtrmap_maxlevels = d_maxlevels + 1; + return; + } + + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, + mp->m_groups[XG_TYPE_RTG].blocks); + + /* Add one level to handle the inode root level. */ + mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrmap btree size for some records. */ +unsigned long long +xfs_rtrmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrmap_mnr, len); +} + +/* + * Calculate the maximum rmap btree size. + */ +static unsigned long long +xfs_rtrmapbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrmap_mxr[0] == 0) + return 0; + + return xfs_rtrmapbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +xfs_filblks_t +xfs_rtrmapbt_calc_reserves( + struct xfs_mount *mp) +{ + uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks; + + if (!xfs_has_rtrmapbt(mp)) + return 0; + + /* Reserve 1% of the rtgroup or enough for 1 block per record. */ + return max_t(xfs_filblks_t, blocks / 100, + xfs_rtrmapbt_max_size(mp, blocks)); +} + +/* Convert on-disk form of btree root to in-memory form. */ +STATIC void +xfs_rtrmapbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock); + unsigned int numrecs; + unsigned int maxrecs; + + xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + numrecs = be16_to_cpu(dblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_droot_key_addr(dblock, 1); + tkp = xfs_rtrmap_key_addr(rblock, 1); + fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_droot_rec_addr(dblock, 1); + trp = xfs_rtrmap_rec_addr(rblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reverse mapping btree root in from disk. */ +int +xfs_iformat_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrmap inodes before adding a realtime volume + * to the filesystem, so we cannot use the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrmap_maxlevels || + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrmap_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* Convert in-memory form of btree root to on-disk form. */ +void +xfs_rtrmapbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + unsigned int rblocklen, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen) +{ + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + numrecs = be16_to_cpu(rblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_key_addr(rblock, 1); + tkp = xfs_rtrmap_droot_key_addr(dblock, 1); + fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_rec_addr(rblock, 1); + trp = xfs_rtrmap_droot_rec_addr(dblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reverse mapping btree root out to disk. */ +void +xfs_iflush_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, + dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime rmap btree inode. + */ +int +xfs_rtrmapbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + + return 0; +} + +/* + * Initialize an rmap for a realtime superblock using the potentially updated + * rt geometry in the provided @mp. + */ +int +xfs_rtrmapbt_init_rtsb( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + struct xfs_trans *tp) +{ + struct xfs_rmap_irec rmap = { + .rm_blockcount = mp->m_sb.sb_rextsize, + .rm_owner = XFS_RMAP_OWN_FS, + }; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_has_rtsb(mp)); + ASSERT(rtg_rgno(rtg) == 0); + + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_map_raw(cur, &rmap); + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. + */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h new file mode 100644 index 000000000000..e328fd62a149 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTRMAP_BTREE_H__ +#define __XFS_RTRMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; +struct xfbtree; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); +void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrmapbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_rmap_rec * +xfs_rtrmap_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_high_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + sizeof(struct xfs_rmap_key) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); + +int __init xfs_rtrmapbt_init_cur_cache(void); +void xfs_rtrmapbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); + +/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */ + +static inline struct xfs_rmap_rec * +xfs_rtrmap_droot_rec_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_droot_key_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)(block + 1) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_droot_ptr_addr( + struct xfs_rtrmap_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)(block + 1) + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrmap_ptr_addr(bb, index, + xfs_rtrmapbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTRMAP_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb) +{ + return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrmap_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrmap_root); + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrmap_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, + unsigned int rblocklen, struct xfs_rtrmap_root *dblock, + unsigned int dblocklen); +void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + struct xfs_trans *tp); + +unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_rgnumber_t rgno); + +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + +#endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4a9e8588f4c9..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -26,6 +26,10 @@ #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_exchrange.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -175,6 +179,14 @@ xfs_sb_version_to_features( features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) + features |= XFS_FEAT_EXCHANGE_RANGE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) + features |= XFS_FEAT_PARENT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) + features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -227,6 +239,73 @@ xfs_validate_sb_read( return 0; } +/* Return the number of extents covered by a single rt bitmap file */ +static xfs_rtbxlen_t +xfs_extents_per_rbm( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_rgextents; + return sbp->sb_rextents; +} + +/* + * Return the payload size of a single rt bitmap block (without the metadata + * header if any). + */ +static inline unsigned int +xfs_rtbmblock_size( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); + return sbp->sb_blocksize; +} + +static uint64_t +xfs_expected_rbmblocks( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; + return howmany_64(xfs_extents_per_rbm(sbp), + NBBY * xfs_rtbmblock_size(sbp)); +} + +/* Validate the realtime geometry */ +bool +xfs_validate_rt_geometry( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) + return false; + return true; + } + + if (sbp->sb_rextents == 0 || + sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || + sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || + sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) + return false; + + return true; +} + /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( @@ -260,13 +339,6 @@ xfs_validate_sb_write( * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"Corruption detected in superblock compatible features (0x%x)!", - (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); - return -EFSCORRUPTED; - } - if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, @@ -302,6 +374,106 @@ xfs_validate_sb_write( return 0; } +int +xfs_compute_rgblklog( + xfs_rtxlen_t rgextents, + xfs_rgblock_t rextsize) +{ + uint64_t rgblocks = (uint64_t)rgextents * rextsize; + + return xfs_highbit64(rgblocks - 1) + 1; +} + +static int +xfs_validate_sb_rtgroups( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + uint64_t groups; + int rgblklog; + + if (sbp->sb_rextsize == 0) { + xfs_warn(mp, +"Realtime extent size must not be zero."); + return -EINVAL; + } + + if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { + xfs_warn(mp, +"Realtime group size (%u) must be less than %u rt extents.", + sbp->sb_rgextents, + XFS_MAX_RGBLOCKS / sbp->sb_rextsize); + return -EINVAL; + } + + if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { + xfs_warn(mp, +"Realtime group size (%u) must be at least %u rt extents.", + sbp->sb_rgextents, XFS_MIN_RGEXTENTS); + return -EINVAL; + } + + if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { + xfs_warn(mp, +"Realtime groups (%u) must be less than %u.", + sbp->sb_rgcount, XFS_MAX_RGNUMBER); + return -EINVAL; + } + + groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); + if (groups != sbp->sb_rgcount) { + xfs_warn(mp, +"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", + sbp->sb_rgcount, groups); + return -EINVAL; + } + + /* Exchange-range is required for fsr to work on realtime files */ + if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { + xfs_warn(mp, +"Realtime groups feature requires exchange-range support."); + return -EINVAL; + } + + rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); + if (sbp->sb_rgblklog != rgblklog) { + xfs_warn(mp, +"Realtime group log (%d) does not match expected value (%d).", + sbp->sb_rgblklog, rgblklog); + return -EINVAL; + } + + return 0; +} + +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( @@ -313,6 +485,7 @@ xfs_validate_sb_common( uint32_t agcount = 0; uint32_t rem; bool has_dalign; + int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, @@ -361,6 +534,38 @@ xfs_validate_sb_common( sbp->sb_inoalignmt, align); return -EINVAL; } + + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { + xfs_warn(mp, +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); + return -EINVAL; + } + } else if (sbp->sb_spino_align) { + xfs_warn(mp, + "Sparse inode alignment (%u) should be zero.", + sbp->sb_spino_align); + return -EINVAL; + } + + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { + xfs_warn(mp, +"Metadir superblock padding fields must be zero."); + return -EINVAL; + } + + error = xfs_validate_sb_rtgroups(mp, sbp); + if (error) + return error; + } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { @@ -486,39 +691,13 @@ xfs_validate_sb_common( } } - /* Validate the realtime geometry; stolen from xfs_repair */ - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + if (!xfs_validate_rt_geometry(sbp)) { xfs_notice(mp, - "realtime extent sanity check failed"); + "realtime %sgeometry check failed", + sbp->sb_rblocks ? "" : "zeroed "); return -EFSCORRUPTED; } - if (sbp->sb_rblocks == 0) { - if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || - sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { - xfs_notice(mp, - "realtime zeroed geometry check failed"); - return -EFSCORRUPTED; - } - } else { - uint64_t rexts; - uint64_t rbmblocks; - - rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); - rbmblocks = howmany_64(sbp->sb_rextents, - NBBY * sbp->sb_blocksize); - - if (!xfs_validate_rtextents(rexts) || - sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_compute_rextslog(rexts) || - sbp->sb_rbmblocks != rbmblocks) { - xfs_notice(mp, - "realtime geometry sanity check failed"); - return -EFSCORRUPTED; - } - } - /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. @@ -530,7 +709,8 @@ xfs_validate_sb_common( } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), - XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) + XFS_FSB_TO_B(mp, sbp->sb_width), 0, + xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* @@ -554,6 +734,14 @@ xfs_validate_sb_common( void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + sbp->sb_uquotino = NULLFSINO; + sbp->sb_gquotino = NULLFSINO; + sbp->sb_pquotino = NULLFSINO; + return; + } + /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota @@ -677,6 +865,28 @@ __xfs_sb_from_disk( /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = be64_to_cpu(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); + to->sb_rgcount = be32_to_cpu(from->sb_rgcount); + to->sb_rgextents = be32_to_cpu(from->sb_rgextents); + to->sb_rbmino = NULLFSINO; + to->sb_rsumino = NULLFSINO; + } else { + to->sb_metadirino = NULLFSINO; + to->sb_rgcount = 1; + to->sb_rgextents = 0; + } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -694,6 +904,15 @@ xfs_sb_quota_to_disk( { uint16_t qflags = from->sb_qflags; + if (xfs_sb_is_v5(from) && + (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_uquotino = cpu_to_be64(0); + to->sb_gquotino = cpu_to_be64(0); + to->sb_pquotino = cpu_to_be64(0); + return; + } + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* @@ -824,6 +1043,21 @@ xfs_sb_to_disk( to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = cpu_to_be64(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memset(to->sb_pad, 0, sizeof(to->sb_pad)); + to->sb_rgcount = cpu_to_be32(from->sb_rgcount); + to->sb_rgextents = cpu_to_be32(from->sb_rgextents); + to->sb_rbmino = cpu_to_be64(0); + to->sb_rsumino = cpu_to_be64(0); + } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -953,6 +1187,49 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .verify_write = xfs_sb_write_verify, }; +/* Compute cached rt geometry from the incore sb. */ +void +xfs_sb_mount_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; + rgs->blklog = mp->m_sb.sb_rgblklog; + rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; + } else { + rgs->blocks = 0; + rgs->blklog = 0; + rgs->blkmask = (uint64_t)-1; + } +} + +/* Update incore sb rt extent size, then recompute the cached rt geometry. */ +void +xfs_mount_sb_set_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp, + xfs_agblock_t rextsize) +{ + sbp->sb_rextsize = rextsize; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, + rextsize); + + xfs_sb_mount_rextsize(mp, sbp); +} + /* * xfs_mount_common * @@ -967,6 +1244,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; @@ -975,31 +1254,47 @@ xfs_sb_mount_common( mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); - mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; + mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; + + ags->blocks = mp->m_sb.sb_agblocks; + ags->blklog = mp->m_sb.sb_agblklog; + ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); + + xfs_sb_mount_rextsize(mp, sbp); - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); + mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); + mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; + mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; + + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); @@ -1025,18 +1320,24 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. - * - * Do not update sb_frextents here because it is not part of the lazy - * sb counters, despite having a percpu counter. It is always kept - * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() - * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, - percpu_counter_sum(&mp->m_ifree), + percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); + } + + /* + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This counter can go negative due to the way + * we handle nearly-lockless reservations, so we must use the _positive + * variant here to avoid writing out nonsense frextents. + */ + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { + mp->m_sb.sb_frextents = + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -1088,18 +1389,17 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno = 1; + struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -1111,7 +1411,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - pag->pag_agno); + pag_agno(pag)); if (!saved_error) saved_error = error; continue; @@ -1125,26 +1425,22 @@ xfs_update_secondary_sbs( xfs_buf_relse(bp); /* don't hold too many buffers at once */ - if (agno % 16) + if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, pag->pag_agno); + error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); - if (error) { - xfs_warn(mp, - "write error %d updating a secondary superblock near ag %d", - error, agno); - } - + if (error) + xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } @@ -1154,10 +1450,12 @@ xfs_update_secondary_sbs( */ int xfs_sync_sb_buf( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; + struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); @@ -1167,6 +1465,11 @@ xfs_sync_sb_buf( bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); + if (update_rtsb) { + rtsb_bp = xfs_log_rtsb(tp, bp); + if (rtsb_bp) + xfs_trans_bhold(tp, rtsb_bp); + } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -1175,7 +1478,11 @@ xfs_sync_sb_buf( * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); + if (!error && rtsb_bp) + error = xfs_bwrite(rtsb_bp); out: + if (rtsb_bp) + xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } @@ -1250,6 +1557,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; + if (xfs_has_parent(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; @@ -1258,6 +1567,12 @@ xfs_fs_geometry( } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; + if (xfs_has_exchange_range(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; + if (xfs_has_metadir(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1273,6 +1588,15 @@ xfs_fs_geometry( return; geo->version = XFS_FSOP_GEOM_VERSION_V5; + + if (xfs_has_rtgroups(mp)) { + geo->rgcount = sbp->sb_rgcount; + geo->rgextents = sbp->sb_rgextents; + } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. */ @@ -1290,6 +1614,8 @@ xfs_sb_read_secondary( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB); if (error) return error; xfs_buf_set_ref(bp, XFS_SSB_REF); @@ -1321,8 +1647,10 @@ xfs_sb_get_secondary( } /* - * sunit, swidth, sectorsize(optional with 0) should be all in bytes, - * so users won't be confused by values in error messages. + * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users + * won't be confused by values in error messages. This function returns false + * if the stripe geometry is invalid and the caller is unable to repair the + * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( @@ -1330,20 +1658,21 @@ xfs_validate_stripe_geometry( __s64 sunit, __s64 swidth, int sectorsize, + bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); - return false; + goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); - return false; + goto check_override; } if (sectorsize && (int)sunit % sectorsize) { @@ -1351,21 +1680,21 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); - return false; + goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); - return false; + goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); - return false; + goto check_override; } if (sunit && (int)swidth % (int)sunit) { @@ -1373,7 +1702,39 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); - return false; + goto check_override; } return true; + +check_override: + if (!may_repair) + return false; + /* + * During mount, mp->m_dalign will not be set unless the sunit mount + * option was set. If it was set, ignore the bad stripe alignment values + * and allow the validation and overwrite later in the mount process to + * attempt to overwrite the bad stripe alignment values with the values + * supplied by mount options. + */ + if (!mp->m_dalign) + return false; + if (!silent) + xfs_notice(mp, +"Will try to correct with specified mount options sunit (%d) and swidth (%d)", + BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); + return true; +} + +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); } diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 19134b23c10b..34d0dd374e9b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -15,8 +15,11 @@ struct xfs_perag; extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); -extern int xfs_sync_sb_buf(struct xfs_mount *mp); +extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_mount_sb_set_rextsize(struct xfs_mount *mp, + struct xfs_sb *sbp, xfs_agblock_t rextsize); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); @@ -35,7 +38,12 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, - __s64 sunit, __s64 swidth, int sectorsize, bool silent); +bool xfs_validate_stripe_geometry(struct xfs_mount *mp, + __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, + bool silent); +bool xfs_validate_rt_geometry(struct xfs_sb *sbp); + +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 4220d3584c1b..b1e0d9bc1f7d 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,11 +38,89 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; +extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; +extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; +/* btree ops */ +extern const struct xfs_btree_ops xfs_bnobt_ops; +extern const struct xfs_btree_ops xfs_cntbt_ops; +extern const struct xfs_btree_ops xfs_inobt_ops; +extern const struct xfs_btree_ops xfs_finobt_ops; +extern const struct xfs_btree_ops xfs_bmbt_ops; +extern const struct xfs_btree_ops xfs_refcountbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; + +static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bnobt_ops; +} + +static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_cntbt_ops; +} + +static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bmbt_ops; +} + +static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_inobt_ops; +} + +static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_finobt_ops; +} + +static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_refcountbt_ops; +} + +static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_ops; +} + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_mem_ops; +} + +static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_mem_ops; +} +#else +# define xfs_btree_is_mem_rmap(...) (false) +# define xfs_btree_is_mem_rtrmap(...) (false) +#endif + +static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_ops; +} + +static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrefcountbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -70,7 +148,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_RES_FDBLKS (1u << 6) /* Transaction contains an intent done log item */ #define XFS_TRANS_HAS_INTENT_DONE (1u << 7) - /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -82,7 +159,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, * for free space from AG 0. If the correct transaction reservations have been * made then this algorithm will eventually find all the space it needs. */ -#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */ +#define XFS_TRANS_LOWMODE (1u << 8) + +/* Transaction has locked the rtbitmap and rtsum inodes */ +#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9) /* * Field values for xfs_trans_mod_sb. @@ -101,6 +181,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_SB_RBLOCKS 0x00000800 #define XFS_TRANS_SB_REXTENTS 0x00001000 #define XFS_TRANS_SB_REXTSLOG 0x00002000 +#define XFS_TRANS_SB_RGCOUNT 0x00004000 /* * Here we centralize the specification of XFS meta-data buffer reference count @@ -121,26 +202,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_REFC_BTREE_REF 1 #define XFS_SSB_REF 0 -/* - * Flags for xfs_trans_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ -#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - - -/* - * Symlink decoding/encoding functions - */ -int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); -int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); - /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { /* Maximum inode count in this filesystem. */ @@ -188,6 +249,9 @@ struct xfs_ino_geometry { /* precomputed value for di_flags2 */ uint64_t new_diflags2; + /* minimum folio order of a page cache allocation */ + unsigned int min_folio_order; + }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 160aa20aa441..fb47a76ead18 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -16,7 +16,10 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" - +#include "xfs_symlink_remote.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_health.h" /* * Each contiguous block has a header, so it is not just a simple pathlen @@ -89,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) @@ -166,7 +171,8 @@ xfs_symlink_local_to_remote( struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp) + struct xfs_ifork *ifp, + void *priv) { struct xfs_mount *mp = ip->i_mount; char *buf; @@ -227,3 +233,200 @@ xfs_symlink_shortform_verify( return __this_address; return NULL; } + +/* Read a remote symlink target into the buffer. */ +int +xfs_symlink_remote_read( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_disk_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + if (error) + return error; + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_has_crc(mp)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + error = -EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, cur_chunk, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_disk_size] = '\0'; + error = 0; + + out: + return error; +} + +/* Write the symlink target into the inode. */ +int +xfs_symlink_write_target( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_ino_t owner, + const char *target_path, + int pathlen, + xfs_fsblock_t fs_blocks, + uint resblks) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + const char *cur_chunk; + struct xfs_buf *bp; + xfs_daddr_t d; + int byte_cnt; + int nmaps; + int offset = 0; + int n; + int error; + + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= xfs_inode_data_fork_size(ip)) { + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + + ip->i_disk_size = pathlen; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + return 0; + } + + nmaps = XFS_SYMLINK_MAPS; + error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA, + resblks, mval, &nmaps); + if (error) + return error; + + ip->i_disk_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + return 0; +} + +/* Remove all the blocks from a symlink and invalidate buffers. */ +int +xfs_symlink_remote_truncate( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp; + int nmaps = XFS_SYMLINK_MAPS; + int done = 0; + int i; + int error; + + /* Read mappings and invalidate buffers. */ + error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0); + if (error) + return error; + + for (i = 0; i < nmaps; i++) { + if (!xfs_bmap_is_real_extent(&mval[i])) + break; + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) + return error; + + xfs_trans_binval(tp, bp); + } + + /* Unmap the remote blocks. */ + error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done); + if (error) + return error; + if (!done) { + ASSERT(done); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h new file mode 100644 index 000000000000..c1672fe1f17b --- /dev/null +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_SYMLINK_REMOTE_H +#define __XFS_SYMLINK_REMOTE_H + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); +int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); +int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_ino_t owner, const char *target_path, int pathlen, + xfs_fsblock_t fs_blocks, uint resblks); +int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip); + +#endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 70e97ea6eee7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -31,7 +31,7 @@ xfs_trans_ijoin( { struct xfs_inode_log_item *iip; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; @@ -60,14 +60,16 @@ xfs_trans_ichgtime( struct timespec64 tv; ASSERT(tp); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); + if (flags & XFS_ICHGTIME_ACCESS) + inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } @@ -90,7 +92,7 @@ xfs_trans_log_inode( struct inode *inode = VFS_I(ip); ASSERT(iip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6cd45e8c118d..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -20,6 +20,14 @@ #include "xfs_qm.h" #include "xfs_trans_space.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_item.h" +#include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -90,6 +98,14 @@ xfs_refcountbt_block_count( return num_ops * (2 * mp->m_refc_maxlevels - 1); } +static unsigned int +xfs_rtrefcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -128,7 +144,7 @@ xfs_calc_inode_res( (4 * sizeof(struct xlog_op_header) + sizeof(struct xfs_inode_log_format) + mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + 2 * xfs_bmbt_block_len(mp)); } /* @@ -211,7 +227,9 @@ xfs_calc_inode_chunk_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime - * extents, as well as the realtime summary block. + * extents, as well as the realtime summary block (t1). Realtime rmap btree + * operations happen in a second transaction, so factor in a couple of rtrmapbt + * splits (t2). */ static unsigned int xfs_rtalloc_block_count( @@ -220,10 +238,16 @@ xfs_rtalloc_block_count( { unsigned int rtbmp_blocks; xfs_rtxlen_t rtxlen; + unsigned int t1, t2 = 0; rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); - return (rtbmp_blocks + 1) * num_ops; + rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); + t1 = (rtbmp_blocks + 1) * num_ops; + + if (xfs_has_rmapbt(mp)) + t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1); + + return max(t1, t2); } /* @@ -246,26 +270,64 @@ xfs_rtalloc_block_count( */ /* + * Finishing a data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * - * This is calculated as: + * This is calculated as the max of: * Data device refcount updates (t1): * the agfs of the ags containing the blocks: nr_ops * sector size * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size */ static unsigned int xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + return max(t1, t2); } /* @@ -336,11 +398,11 @@ xfs_calc_write_reservation( blksz); t1 += adj; t3 += adj; - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 1); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -351,6 +413,96 @@ xfs_calc_write_reservation_minlogsize( } /* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + +/* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size @@ -382,16 +534,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -408,11 +552,11 @@ xfs_calc_itruncate_reservation( xfs_refcountbt_block_count(mp, 4), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 2); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -422,29 +566,108 @@ xfs_calc_itruncate_reservation_minlogsize( return xfs_calc_itruncate_reservation(mp, true); } +static inline unsigned int xfs_calc_pptr_link_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_unlink_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_replace_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} + /* * In renaming a files we can modify: * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: + * of bmap blocks) giving (t2): * the agf for the ags in which the blocks live: 3 * sector size * the agfl for the ags in which the blocks live: 3 * sector size * the superblock for the free block count: sector size * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + * If parent pointers are enabled (t3), then each transaction in the chain + * must be capable of setting or removing the extended attribute + * containing the parent information. It must also be able to handle + * the three xattr intent items that track the progress of the parent + * pointer update. */ STATIC uint xfs_calc_rename_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 5) + - xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_inode_res(mp, 5) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)); + + t2 = xfs_calc_finish_efi_reservation(mp, 3); + + if (xfs_has_parent(mp)) { + unsigned int rename_overhead, exchange_overhead; + + t3 = max(resp->tr_attrsetm.tr_logres, + resp->tr_attrrm.tr_logres); + + /* + * For a standard rename, the three xattr intent log items + * are (1) replacing the pptr for the source file; (2) + * removing the pptr on the dest file; and (3) adding a + * pptr for the whiteout file in the src dir. + * + * For an RENAME_EXCHANGE, there are two xattr intent + * items to replace the pptr for both src and dest + * files. Link counts don't change and there is no + * whiteout. + * + * In the worst case we can end up relogging all log + * intent items to allow the log tail to move ahead, so + * they become overhead added to each transaction in a + * processing chain. + */ + rename_overhead = xfs_calc_pptr_replace_overhead() + + xfs_calc_pptr_unlink_overhead() + + xfs_calc_pptr_link_overhead(); + exchange_overhead = 2 * xfs_calc_pptr_replace_overhead(); + + overhead += max(rename_overhead, exchange_overhead); + } + + return overhead + max3(t1, t2, t3); +} + +static inline unsigned int +xfs_rename_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* One for the rename, one more for freeing blocks */ + unsigned int ret = XFS_RENAME_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to remove or add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += max(resp->tr_attrsetm.tr_logcount, + resp->tr_attrrm.tr_logcount); + + return ret; } /* @@ -461,6 +684,23 @@ xfs_calc_iunlink_remove_reservation( 2 * M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_link_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_LINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * For creating a link to an inode: * the parent directory inode: inode size @@ -477,14 +717,21 @@ STATIC uint xfs_calc_link_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_remove_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_remove_reservation(mp); + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -499,6 +746,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_remove_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_REMOVE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrrm.tr_logcount; + + return ret; +} + /* * For removing a directory entry we can modify: * the parent directory inode: inode size @@ -515,14 +779,22 @@ STATIC uint xfs_calc_remove_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_add_reservation(mp); + + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 2); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrrm.tr_logres; + overhead += xfs_calc_pptr_unlink_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -571,24 +843,69 @@ xfs_calc_icreate_resv_alloc( xfs_calc_finobt_res(mp); } +static inline unsigned int +xfs_icreate_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_CREATE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + STATIC uint -xfs_calc_icreate_reservation(xfs_mount_t *mp) +xfs_calc_icreate_reservation( + struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max(xfs_calc_icreate_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_icreate_resv_alloc(mp); + t2 = xfs_calc_create_resv_modify(mp); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } STATIC uint xfs_calc_create_tmpfile_reservation( struct xfs_mount *mp) { - uint res = XFS_DQUOT_LOGRES(mp); + uint res = XFS_DQUOT_LOGRES; res += xfs_calc_icreate_resv_alloc(mp); return res + xfs_calc_iunlink_add_reservation(mp); } +static inline unsigned int +xfs_mkdir_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_MKDIR_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * Making a new directory is the same as creating a new file. */ @@ -599,6 +916,22 @@ xfs_calc_mkdir_reservation( return xfs_calc_icreate_reservation(mp); } +static inline unsigned int +xfs_symlink_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_SYMLINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} /* * Making a new symplink is the same as creating a new file, but @@ -632,7 +965,7 @@ STATIC uint xfs_calc_ifree_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_iunlink_remove_reservation(mp) + @@ -649,7 +982,7 @@ STATIC uint xfs_calc_ichange_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); @@ -721,7 +1054,7 @@ xfs_calc_growrtfree_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + - xfs_calc_buf_res(1, mp->m_rsumsize); + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, mp->m_rsumblocks)); } /* @@ -758,7 +1091,7 @@ STATIC uint xfs_calc_addafork_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + @@ -806,7 +1139,7 @@ STATIC uint xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); @@ -846,7 +1179,7 @@ STATIC uint xfs_calc_attrrm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)) + @@ -911,54 +1244,85 @@ xfs_calc_sb_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } -void -xfs_trans_resv_calc( +/* + * Namespace reservations. + * + * These get tricky when parent pointers are enabled as we have attribute + * modifications occurring from within these transactions. Rather than confuse + * each of these reservation calculations with the conditional attribute + * reservations, add them here in a clear and concise manner. This requires that + * the attribute reservations have already been calculated. + * + * Note that we only include the static attribute reservation here; the runtime + * reservation will have to be modified by the size of the attributes being + * added/removed/modified. See the comments on the attribute reservation + * calculations for more details. + */ +STATIC void +xfs_calc_namespace_reservations( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - int logcount_adj = 0; - - /* - * The following transactions are logged in physical format and - * require a permanent reservation on space. - */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; - resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; - resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + ASSERT(resp->tr_attrsetm.tr_logres > 0); resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); - resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp); resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); - resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp); resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); - resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp); resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); - resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp); resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); - resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp); resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp); + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; +} + +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + int logcount_adj = 0; + + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = xfs_calc_create_tmpfile_reservation(mp); resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); - resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; - resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -988,6 +1352,8 @@ xfs_trans_resv_calc( resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + xfs_calc_namespace_reservations(mp, resp); + /* * The following transactions are logged in logical format with * a default log count. @@ -1027,4 +1393,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. + */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. + */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. + */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,32 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c new file mode 100644 index 000000000000..b9dc3752f702 --- /dev/null +++ b/fs/xfs/libxfs/xfs_trans_space.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" + +/* Calculate the disk space required to add a parent pointer. */ +unsigned int +xfs_parent_calc_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + /* + * Parent pointers are always the first attr in an attr tree, and never + * larger than a block + */ + return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + + XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK); +} + +unsigned int +xfs_create_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_mkdir_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + return xfs_create_space_res(mp, namelen); +} + +unsigned int +xfs_link_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_symlink_space_res( + struct xfs_mount *mp, + unsigned int namelen, + unsigned int fsblocks) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) + + fsblocks; + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_remove_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp); + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_rename_space_res( + struct xfs_mount *mp, + unsigned int src_namelen, + bool target_exists, + unsigned int target_namelen, + bool has_whiteout) +{ + unsigned int ret; + + ret = XFS_DIRREMOVE_SPACE_RES(mp) + + XFS_DIRENTER_SPACE_RES(mp, target_namelen); + + if (xfs_has_parent(mp)) { + if (has_whiteout) + ret += xfs_parent_calc_space_res(mp, src_namelen); + ret += 2 * xfs_parent_calc_space_res(mp, target_namelen); + } + + if (target_exists) + ret += xfs_parent_calc_space_res(mp, target_namelen); + + return ret; +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 87b31c69a773..d89b570aafcc 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -10,6 +10,23 @@ * Components of space reservations. */ +/* Worst case number of bmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ + (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) + +/* Worst case number of realtime rmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \ + (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0])) + +/* Adding one realtime rmap could split every level to the top of the tree. */ +#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels) + +/* Blocks we might need to add "b" realtime rmaps to a tree. */ +#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \ + ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \ + XFS_RTRMAPADD_SPACE_RES(mp)) + /* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) @@ -76,31 +93,32 @@ /* This macro is not used - see inline code in xfs_attr_set */ #define XFS_ATTRSET_SPACE_RES(mp, v) \ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) -#define XFS_CREATE_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) -#define XFS_LINK_SPACE_RES(mp,nl) \ - XFS_DIRENTER_SPACE_RES(mp,nl) -#define XFS_MKDIR_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_QM_DQALLOC_SPACE_RES(mp) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ XFS_DQUOT_CLUSTER_SIZE_FSB) #define XFS_QM_QINOCREATE_SPACE_RES(mp) \ XFS_IALLOC_SPACE_RES(mp) -#define XFS_REMOVE_SPACE_RES(mp) \ - XFS_DIRREMOVE_SPACE_RES(mp) -#define XFS_RENAME_SPACE_RES(mp,nl) \ - (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) -#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0) +unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp, + unsigned int namelen); + +unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen, + unsigned int fsblocks); +unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen); + +unsigned int xfs_rename_space_res(struct xfs_mount *mp, + unsigned int src_namelen, bool target_exists, + unsigned int target_namelen, bool has_whiteout); #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index c299b16c9365..1faf04204c5d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -12,6 +12,8 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* @@ -111,7 +113,7 @@ xfs_verify_ino( /* Is this an internal inode number? */ inline bool -xfs_internal_inum( +xfs_is_sb_inum( struct xfs_mount *mp, xfs_ino_t ino) { @@ -129,24 +131,42 @@ xfs_verify_dir_ino( struct xfs_mount *mp, xfs_ino_t ino) { - if (xfs_internal_inum(mp, ino)) + if (xfs_is_sb_inum(mp, ino)) return false; return xfs_verify_ino(mp, ino); } /* - * Verify that an realtime block number pointer doesn't point off the - * end of the realtime device. + * Verify that a realtime block number pointer neither points outside the + * allocatable areas of the rtgroup nor off the end of the realtime + * device. */ inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + if (xfs_has_rtgroups(mp)) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno); + + if (rgno >= mp->m_sb.sb_rgcount) + return false; + if (rtx >= xfs_rtgroup_extents(mp, rgno)) + return false; + if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0) + return false; + return true; + } + return rtbno < mp->m_sb.sb_rblocks; } -/* Verify that a realtime device extent is fully contained inside the volume. */ +/* + * Verify that an allocated realtime device extent neither points outside + * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the + * end of the realtime device. + */ bool xfs_verify_rtbext( struct xfs_mount *mp, @@ -159,7 +179,14 @@ xfs_verify_rtbext( if (!xfs_verify_rtbno(mp, rtbno)) return false; - return xfs_verify_rtbno(mp, rtbno + len - 1); + if (!xfs_verify_rtbno(mp, rtbno + len - 1)) + return false; + + if (xfs_has_rtgroups(mp) && + xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1)) + return false; + + return true; } /* Calculate the range of valid icount values. */ @@ -170,13 +197,12 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 20b5375f2d9c..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -9,10 +9,12 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef uint32_t xfs_rgnumber_t; /* realtime group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ @@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t; #define NULLFILEOFF ((xfs_fileoff_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLRGBLOCK ((xfs_rgblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLRGNUMBER ((xfs_rgnumber_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -80,11 +84,13 @@ typedef void * xfs_failaddr_t; /* * Inode fork identifiers. */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 -#define XFS_COW_FORK 2 +#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */ +#define XFS_DATA_FORK (0) +#define XFS_ATTR_FORK (1) +#define XFS_COW_FORK (2) #define XFS_WHICHFORK_STRINGS \ + { XFS_STAGING_FORK, "staging" }, \ { XFS_DATA_FORK, "data" }, \ { XFS_ATTR_FORK, "attr" }, \ { XFS_COW_FORK, "cow" } @@ -114,24 +120,6 @@ typedef enum { { XFS_LOOKUP_LEi, "le" }, \ { XFS_LOOKUP_GEi, "ge" } -/* - * This enum is used in string mapping in xfs_trace.h and scrub/trace.h; - * please keep the TRACE_DEFINE_ENUMs for it up to date. - */ -typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX -} xfs_btnum_t; - -#define XFS_BTNUM_STRINGS \ - { XFS_BTNUM_BNOi, "bnobt" }, \ - { XFS_BTNUM_CNTi, "cntbt" }, \ - { XFS_BTNUM_RMAPi, "rmapbt" }, \ - { XFS_BTNUM_BMAPi, "bmbt" }, \ - { XFS_BTNUM_INOi, "inobt" }, \ - { XFS_BTNUM_FINOi, "finobt" }, \ - { XFS_BTNUM_REFCi, "refcbt" } - struct xfs_name { const unsigned char *name; int len; @@ -214,6 +202,13 @@ enum xfs_ag_resv_type { * altering fdblocks. If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. + */ + XFS_AG_RESV_METAFILE, }; /* Results of scanning a btree keyspace to check occupancy. */ @@ -228,6 +223,44 @@ enum xbtree_recpacking { XBTREE_RECPACKING_FULL, }; +enum xfs_group_type { + XG_TYPE_AG, + XG_TYPE_RTG, + XG_TYPE_MAX, +} __packed; + +#define XG_TYPE_STRINGS \ + { XG_TYPE_AG, "ag" }, \ + { XG_TYPE_RTG, "rtg" } + +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. + */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ @@ -238,7 +271,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. + */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ |