diff options
Diffstat (limited to 'fs/xfs/libxfs')
95 files changed, 15322 insertions, 3527 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index dc1873f76bff..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -30,137 +30,7 @@ #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" - - -/* - * Passive reference counting access wrappers to the perag structures. If the - * per-ag structure is to be freed, the freeing code is responsible for cleaning - * up objects with passive references before freeing the structure. This is - * things like cached buffers. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - trace_xfs_perag_get(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) >= 0); - atomic_inc(&pag->pag_ref); - } - rcu_read_unlock(); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - unsigned int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_get_tag(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - rcu_read_unlock(); - return pag; -} - -/* Get a passive reference to the given perag. */ -struct xfs_perag * -xfs_perag_hold( - struct xfs_perag *pag) -{ - ASSERT(atomic_read(&pag->pag_ref) > 0 || - atomic_read(&pag->pag_active_ref) > 0); - - trace_xfs_perag_hold(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - trace_xfs_perag_put(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) > 0); - atomic_dec(&pag->pag_ref); -} - -/* - * Active references for perag structures. This is for short term access to the - * per ag structures for walking trees or accessing state. If an AG is being - * shrunk or is offline, then this will fail to find that AG and return NULL - * instead. - */ -struct xfs_perag * -xfs_perag_grab( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - trace_xfs_perag_grab(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_grab_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_grab_tag(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - rcu_read_unlock(); - return pag; -} - -void -xfs_perag_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_rele(pag, _RET_IP_); - if (atomic_dec_and_test(&pag->pag_active_ref)) - wake_up(&pag->pag_active_wq); -} +#include "xfs_group.h" /* * xfs_initialize_perag_data @@ -194,7 +64,7 @@ xfs_initialize_perag_data( pag = xfs_perag_get(mp, index); error = xfs_alloc_read_agf(pag, NULL, 0, NULL); if (!error) - error = xfs_ialloc_read_agi(pag, NULL, NULL); + error = xfs_ialloc_read_agi(pag, NULL, 0, NULL); if (error) { xfs_perag_put(pag); return error; @@ -235,43 +105,32 @@ out: return error; } -STATIC void -__xfs_free_perag( - struct rcu_head *head) +static void +xfs_perag_uninit( + struct xfs_group *xg) { - struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); +#ifdef __KERNEL__ + struct xfs_perag *pag = to_perag(xg); - ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - kfree(pag); + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_buf_cache_destroy(&pag->pag_bcache); +#endif } /* - * Free up the per-ag resources associated with the mount structure. + * Free up the per-ag resources within the specified AG range. */ void -xfs_free_perag( - struct xfs_mount *mp) +xfs_free_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno) + { - struct xfs_perag *pag; xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, agno); - spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); - xfs_defer_drain_free(&pag->pag_intents_drain); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_cache_destroy(&pag->pag_bcache); - - /* drop the mount's active reference */ - xfs_perag_rele(pag); - XFS_IS_CORRUPT(pag->pag_mount, - atomic_read(&pag->pag_active_ref) != 0); - call_rcu(&pag->rcu_head, __xfs_free_perag); - } + for (agno = first_agno; agno < end_agno; agno++) + xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit); } /* Find the size of the AG, in blocks. */ @@ -334,131 +193,100 @@ xfs_agino_range( } /* - * Free perag within the specified AG range, it is only used to free unused - * perags under the error handling path. + * Update the perag of the previous tail AG if it has been changed during + * recovery (i.e. recovery of a growfs). */ -void -xfs_free_unused_perag_range( +int +xfs_update_last_ag_size( struct xfs_mount *mp, - xfs_agnumber_t agstart, - xfs_agnumber_t agend) + xfs_agnumber_t prev_agcount) { - struct xfs_perag *pag; - xfs_agnumber_t index; + struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1); - for (index = agstart; index < agend; index++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); - if (!pag) - break; - xfs_buf_cache_destroy(&pag->pag_bcache); - xfs_defer_drain_free(&pag->pag_intents_drain); - kfree(pag); - } + if (!pag) + return -EFSCORRUPTED; + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, + prev_agcount - 1, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); + xfs_perag_rele(pag); + return 0; } -int -xfs_initialize_perag( +static int +xfs_perag_alloc( struct xfs_mount *mp, + xfs_agnumber_t index, xfs_agnumber_t agcount, - xfs_rfsblock_t dblocks, - xfs_agnumber_t *maxagi) + xfs_rfsblock_t dblocks) { struct xfs_perag *pag; - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. - */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - - error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (error) - goto out_free_pag; - - spin_lock(&mp->m_perag_lock); - if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - WARN_ON_ONCE(1); - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - error = -EEXIST; - goto out_free_pag; - } - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); + pag = kzalloc(sizeof(*pag), GFP_KERNEL); + if (!pag) + return -ENOMEM; #ifdef __KERNEL__ - /* Place kernel structure only init below this point. */ - spin_lock_init(&pag->pag_ici_lock); - spin_lock_init(&pag->pagb_lock); - spin_lock_init(&pag->pag_state_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - xfs_defer_drain_init(&pag->pag_intents_drain); - init_waitqueue_head(&pag->pagb_wait); - init_waitqueue_head(&pag->pag_active_wq); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; - xfs_hooks_init(&pag->pag_rmap_update_hooks); + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); #endif /* __KERNEL__ */ - error = xfs_buf_cache_init(&pag->pag_bcache); - if (error) - goto out_remove_pag; + error = xfs_buf_cache_init(&pag->pag_bcache); + if (error) + goto out_free_perag; - /* Active ref owned by mount indicates AG is online. */ - atomic_set(&pag->pag_active_ref, 1); + /* + * Pre-calculated geometry + */ + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; + error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); + if (error) + goto out_buf_cache_destroy; - /* - * Pre-calculated geometry - */ - pag->block_count = __xfs_ag_block_count(mp, index, agcount, - dblocks); - pag->min_block = XFS_AGFL_BLOCK(mp); - __xfs_agino_range(mp, pag->block_count, &pag->agino_min, - &pag->agino_max); - } + return 0; + +out_buf_cache_destroy: + xfs_buf_cache_destroy(&pag->pag_bcache); +out_free_perag: + kfree(pag); + return error; +} + +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + int error; - index = xfs_set_inode_alloc(mp, agcount); + if (orig_agcount >= new_agcount) + return 0; - if (maxagi) - *maxagi = index; + for (index = orig_agcount; index < new_agcount; index++) { + error = xfs_perag_alloc(mp, index, new_agcount, dblocks); + if (error) + goto out_unwind_new_pags; + } + *maxagi = xfs_set_inode_alloc(mp, new_agcount); mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_remove_pag: - xfs_defer_drain_free(&pag->pag_intents_drain); - spin_lock(&mp->m_perag_lock); - radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); -out_free_pag: - kfree(pag); out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - xfs_free_unused_perag_range(mp, first_initialised, agcount); + xfs_free_perag_range(mp, orig_agcount, index); return error; } @@ -473,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; @@ -913,7 +741,7 @@ xfs_ag_shrink_space( struct xfs_trans **tpp, xfs_extlen_t delta) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -930,8 +758,8 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, *tpp, &agibp); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp); if (error) return error; @@ -963,13 +791,11 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ - error = xfs_ag_resv_free(pag); - if (error) - return error; + xfs_ag_resv_free(pag); /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); + xfs_agbno_to_fsb(pag, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; @@ -1010,7 +836,7 @@ xfs_ag_shrink_space( goto resv_err; err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, - XFS_AG_RESV_NONE, true); + XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); if (err2) goto resv_err; @@ -1028,9 +854,9 @@ xfs_ag_shrink_space( } /* Update perag geometry */ - pag->block_count -= delta; - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count -= delta; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); @@ -1055,14 +881,15 @@ xfs_ag_extend_space( struct xfs_trans *tp, xfs_extlen_t len) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; - ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag, tp, &bp); + error = xfs_ialloc_read_agi(pag, tp, 0, &bp); if (error) return error; @@ -1099,9 +926,9 @@ xfs_ag_extend_space( return error; /* Update perag geometry */ - pag->block_count = be32_to_cpu(agf->agf_length); - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); return 0; } @@ -1119,7 +946,7 @@ xfs_ag_get_geometry( int error; /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); @@ -1128,7 +955,7 @@ xfs_ag_get_geometry( /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = pag->pag_agno; + ageo->ag_number = pag_agno(pag); agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 35de09a2516c..1f24cfa27321 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -7,6 +7,8 @@ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 +#include "xfs_group.h" + struct xfs_mount; struct xfs_trans; struct xfs_perag; @@ -30,11 +32,7 @@ struct xfs_ag_resv { * performance of allocation group selection. */ struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* passive reference count */ - atomic_t pag_active_ref; /* active reference count */ - wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ + struct xfs_group pag_group; unsigned long pag_opstate; uint8_t pagf_bno_level; /* # of levels in bno btree */ uint8_t pagf_cnt_level; /* # of levels in cnt btree */ @@ -55,7 +53,6 @@ struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; - int pagb_count; /* pagb slots in use */ uint8_t pagf_refcount_level; /* recount btree height */ /* Blocks reserved for all kinds of metadata. */ @@ -63,25 +60,13 @@ struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; - /* for rcu-safe freeing */ - struct rcu_head rcu_head; - /* Precalculated geometry info */ - xfs_agblock_t block_count; - xfs_agblock_t min_block; xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. - * Callers should hold pag_state_lock before accessing this field. - */ - uint16_t pag_checked; - uint16_t pag_sick; - #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write @@ -93,13 +78,6 @@ struct xfs_perag { uint8_t pagf_repair_rmap_level; #endif - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ @@ -111,21 +89,29 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - - /* - * We use xfs_drain to track the number of deferred log intent items - * that have been queued (but not yet processed) so that waiters (e.g. - * scrub) will not lock resources when other threads are in the middle - * of processing a chain of intent items only to find momentary - * inconsistencies. - */ - struct xfs_defer_drain pag_intents_drain; - - /* Hook to feed rmapbt updates to an active online repair. */ - struct xfs_hooks pag_rmap_update_hooks; #endif /* __KERNEL__ */ }; +static inline struct xfs_perag *to_perag(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_perag, pag_group); +} + +static inline struct xfs_group *pag_group(struct xfs_perag *pag) +{ + return &pag->pag_group; +} + +static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_mount; +} + +static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_gno; +} + /* * Per-AG operational state. These are atomic flag bits. */ @@ -147,25 +133,80 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) -void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, - xfs_agnumber_t agend); -int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, + xfs_agnumber_t *maxagi); +void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); -void xfs_free_perag(struct xfs_mount *mp); +int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, - unsigned int tag); -struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); -void xfs_perag_put(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + return to_perag(xfs_group_hold(pag_group(pag))); +} + +static inline void +xfs_perag_put( + struct xfs_perag *pag) +{ + xfs_group_put(pag_group(pag)); +} /* Active AG references */ -struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); -struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); -void xfs_perag_rele(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); +} + +static inline void +xfs_perag_rele( + struct xfs_perag *pag) +{ + xfs_group_rele(pag_group(pag)); +} + +static inline struct xfs_perag * +xfs_perag_next_range( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno, + xfs_agnumber_t end_agno) +{ + return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL, + start_agno, end_agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_next_from( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno) +{ + return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); +} + +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + return xfs_perag_next_from(mp, pag, 0); +} /* * Per-ag geometry infomation and validation @@ -177,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { - if (agbno >= pag->block_count) - return false; - if (agbno <= pag->min_block) - return false; - return true; + return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool @@ -190,13 +227,7 @@ xfs_verify_agbext( xfs_agblock_t agbno, xfs_agblock_t len) { - if (agbno + len <= agbno) - return false; - - if (!xfs_verify_agbno(pag, agbno)) - return false; - - return xfs_verify_agbno(pag, agbno + len - 1); + return xfs_verify_gbext(pag_group(pag), agbno, len); } /* @@ -232,47 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } -/* - * Perag iteration APIs - */ -static inline struct xfs_perag * -xfs_perag_next( - struct xfs_perag *pag, - xfs_agnumber_t *agno, - xfs_agnumber_t end_agno) -{ - struct xfs_mount *mp = pag->pag_mount; - - *agno = pag->pag_agno + 1; - xfs_perag_rele(pag); - while (*agno <= end_agno) { - pag = xfs_perag_grab(mp, *agno); - if (pag) - return pag; - (*agno)++; - } - return NULL; -} - -#define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_grab((mp), (agno)); \ - (pag) != NULL; \ - (pag) = xfs_perag_next((pag), &(agno), (end_agno))) - -#define for_each_perag_from(mp, agno, pag) \ - for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - -#define for_each_perag(mp, agno, pag) \ - (agno) = 0; \ - for_each_perag_from((mp), (agno), (pag)) - -#define for_each_perag_tag(mp, agno, pag, tag) \ - for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \ - (pag) != NULL; \ - (agno) = (pag)->pag_agno + 1, \ - xfs_perag_rele(pag), \ - (pag) = xfs_perag_grab_tag((mp), (agno), (tag))) - static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, @@ -281,9 +271,9 @@ xfs_perag_next_wrap( xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - *agno = pag->pag_agno + 1; + *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { @@ -345,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +static inline xfs_fsblock_t +xfs_agbno_to_fsb( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_daddr_t +xfs_agbno_to_daddr( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_ino_t +xfs_agino_to_ino( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); +} + #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index da1057bd0e60..fb79215a509d 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -70,6 +70,7 @@ xfs_ag_resv_critical( struct xfs_perag *pag, enum xfs_ag_resv_type type) { + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t avail; xfs_extlen_t orig; @@ -92,8 +93,8 @@ xfs_ag_resv_critical( /* Critically low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || - avail < pag->pag_mount->m_agbtree_maxlevels, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); + avail < mp->m_agbtree_maxlevels, + mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -113,6 +114,7 @@ xfs_ag_resv_needed( case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; + case XFS_AG_RESV_METAFILE: case XFS_AG_RESV_NONE: /* empty */ break; @@ -126,20 +128,19 @@ xfs_ag_resv_needed( } /* Clean out a reservation */ -static int +static void __xfs_ag_resv_free( struct xfs_perag *pag, enum xfs_ag_resv_type type) { struct xfs_ag_resv *resv; xfs_extlen_t oldresv; - int error; trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - if (pag->pag_agno == 0) - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag_agno(pag) == 0) + pag_mount(pag)->m_ag_max_usable += resv->ar_asked; /* * RMAPBT blocks come from the AGFL and AGFL blocks are always * considered "free", so whatever was reserved at mount time must be @@ -149,30 +150,19 @@ __xfs_ag_resv_free( oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; - error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + xfs_add_fdblocks(pag_mount(pag), oldresv); resv->ar_reserved = 0; resv->ar_asked = 0; resv->ar_orig_reserved = 0; - - if (error) - trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); - return error; } /* Free a per-AG reservation. */ -int +void xfs_ag_resv_free( struct xfs_perag *pag) { - int error; - int err2; - - error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); - err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); - if (err2 && !error) - error = err2; - return error; + __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); + __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); } static int @@ -182,7 +172,7 @@ __xfs_ag_resv_init( xfs_extlen_t ask, xfs_extlen_t used) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_ag_resv *resv; int error; xfs_extlen_t hidden_space; @@ -216,13 +206,12 @@ __xfs_ag_resv_init( if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else - error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); + trace_xfs_ag_resv_init_error(pag, error, _RET_IP_); xfs_warn(mp, "Per-AG reservation for AG %u failed. Filesystem may run out of space.", - pag->pag_agno); + pag_agno(pag)); return error; } @@ -232,7 +221,7 @@ __xfs_ag_resv_init( * counter, we only make the adjustment for AG 0. This assumes that * there aren't any AGs hungrier for per-AG reservation than AG 0. */ - if (pag->pag_agno == 0) + if (pag_agno(pag) == 0) mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); @@ -250,7 +239,7 @@ xfs_ag_resv_init( struct xfs_perag *pag, struct xfs_trans *tp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; @@ -359,6 +348,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: @@ -401,6 +391,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_METAFILE: return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index b74b210008ea..f247eeff7358 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -6,7 +6,7 @@ #ifndef __XFS_AG_RESV_H__ #define __XFS_AG_RESV_H__ -int xfs_ag_resv_free(struct xfs_perag *pag); +void xfs_ag_resv_free(struct xfs_perag *pag); int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp); bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); @@ -33,23 +33,4 @@ xfs_perag_resv( } } -/* - * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from - * the AGFL, they are allocated one at a time and the reservation updates don't - * require a transaction. - */ -static inline void -xfs_ag_resv_rmapbt_alloc( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_alloc_arg args = { NULL }; - struct xfs_perag *pag; - - args.len = 1; - pag = xfs_perag_get(mp, agno); - xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); - xfs_perag_put(pag); -} - #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9da52e92172a..000cc7f4a3ce 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -27,13 +27,12 @@ #include "xfs_ag_resv.h" #include "xfs_bmap.h" #include "xfs_health.h" +#include "xfs_extfree_item.h" struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; -#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) - #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 @@ -79,7 +78,7 @@ xfs_prealloc_blocks( } /* - * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * The number of blocks per AG that we withhold from xfs_dec_fdblocks to * guarantee that we can refill the AGFL prior to allocating space in a nearly * full AG. Although the space described by the free space btrees, the * blocks used by the freesp btrees themselves, and the blocks owned by the @@ -89,7 +88,7 @@ xfs_prealloc_blocks( * until the fs goes down, we subtract this many AG blocks from the incore * fdblocks to ensure user allocation does not overcommit the space the * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to - * withhold space from xfs_mod_fdblocks, so we do not account for that here. + * withhold space from xfs_dec_fdblocks, so we do not account for that here. */ #define XFS_ALLOCBT_AGFL_RESERVE 4 @@ -274,7 +273,7 @@ xfs_alloc_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); @@ -302,7 +301,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -330,7 +329,8 @@ xfs_alloc_compute_aligned( bool busy; /* Trim busy sections out of found extent */ - busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); + busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, + args->maxlen, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -408,8 +408,8 @@ xfs_alloc_compute_diff( if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { if (newlen1 < newlen2 || (newlen1 == newlen2 && - XFS_ABSDIFF(newbno1, wantbno) > - XFS_ABSDIFF(newbno2, wantbno))) + abs_diff(newbno1, wantbno) > + abs_diff(newbno2, wantbno))) newbno1 = newbno2; } else if (newbno2 != NULLAGBLOCK) newbno1 = newbno2; @@ -425,7 +425,7 @@ xfs_alloc_compute_diff( } else newbno1 = freeend - wantlen; *newbnop = newbno1; - return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); + return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); } /* @@ -466,6 +466,97 @@ xfs_alloc_fix_len( } /* + * Determine if the cursor points to the block that contains the right-most + * block of records in the by-count btree. This block contains the largest + * contiguous free extent in the AG, so if we modify a record in this block we + * need to call xfs_alloc_fixup_longest() once the modifications are done to + * ensure the agf->agf_longest field is kept up to date with the longest free + * extent tracked by the by-count btree. + */ +static bool +xfs_alloc_cursor_at_lastrec( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cnt_cur, 0, &bp); + + xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB); + return xfs_btree_ptr_is_null(cnt_cur, &ptr); +} + +/* + * Find the rightmost record of the cntbt, and return the longest free space + * recorded in it. Simply set both the block number and the length to their + * maximum values before searching. + */ +static int +xfs_cntbt_longest( + struct xfs_btree_cur *cnt_cur, + xfs_extlen_t *longest) +{ + struct xfs_alloc_rec_incore irec; + union xfs_btree_rec *rec; + int stat = 0; + int error; + + memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec)); + error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + /* totally empty tree */ + *longest = 0; + return 0; + } + + error = xfs_btree_get_rec(cnt_cur, &rec, &stat); + if (error) + return error; + if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } + + xfs_alloc_btrec_to_irec(rec, &irec); + *longest = irec.ar_blockcount; + return 0; +} + +/* + * Update the longest contiguous free extent in the AG from the by-count cursor + * that is passed to us. This should be done at the end of any allocation or + * freeing operation that touches the longest extent in the btree. + * + * Needing to update the longest extent can be determined by calling + * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record + * modification but before the modification begins. + */ +static int +xfs_alloc_fixup_longest( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_perag *pag = to_perag(cnt_cur->bc_group); + struct xfs_buf *bp = cnt_cur->bc_ag.agbp; + struct xfs_agf *agf = bp->b_addr; + xfs_extlen_t longest = 0; + int error; + + /* Lookup last rec in order to update AGF. */ + error = xfs_cntbt_longest(cnt_cur, &longest); + if (error) + return error; + + pag->pagf_longest = longest; + agf->agf_longest = cpu_to_be32(pag->pagf_longest); + xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST); + + return 0; +} + +/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. @@ -489,6 +580,7 @@ xfs_alloc_fixup_trees( xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ struct xfs_mount *mp; + bool fixup_longest = false; mp = cnt_cur->bc_mp; @@ -577,6 +669,10 @@ xfs_alloc_fixup_trees( nfbno2 = rbno + rlen; nflen2 = (fbno + flen) - nfbno2; } + + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; + /* * Delete the entry from the by-size btree. */ @@ -654,6 +750,10 @@ xfs_alloc_fixup_trees( return -EFSCORRUPTED; } } + + if (fixup_longest) + return xfs_alloc_fixup_longest(cnt_cur); + return 0; } @@ -698,7 +798,7 @@ xfs_agfl_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag))) return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { @@ -778,13 +878,12 @@ xfs_alloc_read_agfl( struct xfs_trans *tp, struct xfs_buf **bpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; int error; - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); @@ -1008,13 +1107,12 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1152,14 +1250,14 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, + xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, - XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + xfs_agbno_to_daddr(args->pag, fbno), args->mp->m_bsize, 0, &bp); if (error) goto error; @@ -1217,7 +1315,6 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; @@ -1266,7 +1363,8 @@ xfs_alloc_ag_vextent_exact( */ tbno = fbno; tlen = flen; - xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); + xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, + &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1297,7 +1395,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, args->pag); - ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -1659,8 +1757,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_near_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - acur.busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), acur.busy_gen, + alloc_flags); if (error) goto out; @@ -1775,8 +1874,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1824,7 +1924,7 @@ restart: error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1874,8 +1974,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1934,11 +2035,10 @@ out_nominleft: /* * Free the extent starting at agno/bno for length. */ -STATIC int +int xfs_free_ag_extent( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, @@ -1958,6 +2058,7 @@ xfs_free_ag_extent( int i; int error; struct xfs_perag *pag = agbp->b_pag; + bool fixup_longest = false; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; @@ -2221,8 +2322,13 @@ xfs_free_ag_extent( } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; + /* * In all cases we need to insert the new freespace in the by-size tree. + * + * If this new freespace is being inserted in the block that contains + * the largest free space in the btree, make sure we also fix up the + * agf->agf-longest tracker field. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; @@ -2231,6 +2337,8 @@ xfs_free_ag_extent( error = -EFSCORRUPTED; goto error0; } + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2238,6 +2346,12 @@ xfs_free_ag_extent( error = -EFSCORRUPTED; goto error0; } + if (fixup_longest) { + error = xfs_alloc_fixup_longest(cnt_cur); + if (error) + goto error0; + } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; @@ -2245,19 +2359,19 @@ xfs_free_ag_extent( * Update the freespace totals in the ag and superblock. */ error = xfs_alloc_update_counters(tp, agbp, len); - xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); if (error) goto error0; XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); + trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); + trace_xfs_free_extent(pag, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2316,7 +2430,7 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ @@ -2424,32 +2538,6 @@ xfs_alloc_space_available( return true; } -int -xfs_free_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_buf *agbp, - struct xfs_owner_info *oinfo) -{ - int error; - struct xfs_buf *bp; - - error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, - XFS_AG_RESV_AGFL); - if (error) - return error; - - error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, - XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), - tp->t_mountp->m_bsize, 0, &bp); - if (error) - return error; - xfs_trans_binval(tp, bp); - - return 0; -} - /* * Check the agfl fields of the agf for inconsistency or corruption. * @@ -2525,7 +2613,7 @@ xfs_agfl_reset( xfs_warn(mp, "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " "Please unmount and run xfs_repair.", - pag->pag_agno, pag->pagf_flcount); + pag_agno(pag), pag->pagf_flcount); agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); @@ -2538,48 +2626,6 @@ xfs_agfl_reset( } /* - * Defer an AGFL block free. This is effectively equivalent to - * xfs_free_extent_later() with some special handling particular to AGFL blocks. - * - * Deferring AGFL frees helps prevent log reservation overruns due to too many - * allocation operations in a transaction. AGFL frees are prone to this problem - * because for one they are always freed one at a time. Further, an immediate - * AGFL block free can cause a btree join and require another block free before - * the real allocation can proceed. Deferring the free disconnects freeing up - * the AGFL slot from freeing the block. - */ -static int -xfs_defer_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_owner_info *oinfo) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *xefi; - xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); - - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(oinfo != NULL); - - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) - return -EFSCORRUPTED; - - xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = fsbno; - xefi->xefi_blockcount = 1; - xefi->xefi_owner = oinfo->oi_owner; - xefi->xefi_agresv = XFS_AG_RESV_AGFL; - - trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - - xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); - return 0; -} - -/* * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ @@ -2590,39 +2636,37 @@ xfs_defer_extent_free( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard, + unsigned int free_flags, struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(type != XFS_AG_RESV_AGFL); + ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) - return -EFSCORRUPTED; + if (free_flags & XFS_FREE_EXTENT_REALTIME) { + if (type != XFS_AG_RESV_NONE) { + ASSERT(type == XFS_AG_RESV_NONE); + return -EFSCORRUPTED; + } + if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len))) + return -EFSCORRUPTED; + } else { + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + } xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; xefi->xefi_agresv = type; - if (skip_discard) + if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (free_flags & XFS_FREE_EXTENT_REALTIME) + xefi->xefi_flags |= XFS_EFI_REALTIME; if (oinfo) { ASSERT(oinfo->oi_offset == 0); @@ -2634,12 +2678,8 @@ xfs_defer_extent_free( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(mp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_extent_free_get_group(mp, xefi); - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); + xfs_extent_free_defer_add(tp, xefi, dfpp); return 0; } @@ -2650,11 +2690,11 @@ xfs_free_extent_later( xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + unsigned int free_flags) { struct xfs_defer_pending *dontcare = NULL; - return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags, &dontcare); } @@ -2679,13 +2719,13 @@ xfs_free_extent_later( int xfs_alloc_schedule_autoreap( const struct xfs_alloc_arg *args, - bool skip_discard, + unsigned int free_flags, struct xfs_alloc_autoreap *aarp) { int error; error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, - &args->oinfo, args->resv, skip_discard, &aarp->dfp); + &args->oinfo, args->resv, free_flags, &aarp->dfp); if (error) return error; @@ -2738,7 +2778,6 @@ xfs_alloc_commit_autoreap( xfs_defer_item_unpause(tp, aarp->dfp); } -#ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to * args->minlen. @@ -2778,7 +2817,6 @@ out: return error; } -#endif /* * Decide whether to use this allocation group for this allocation. @@ -2852,15 +2890,14 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; -#ifdef DEBUG - if (args->alloc_minlen_only) { + if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { int stat; error = xfs_exact_minlen_extent_available(args, agbp, &stat); if (error || !stat) goto out_agbp_relse; } -#endif + /* * Make the freelist shorter if it's too long. * @@ -2897,8 +2934,20 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; - /* defer agfl frees */ - error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + /* + * Defer the AGFL block free. + * + * This helps to prevent log reservation overruns due to too + * many allocation operations in a transaction. AGFL frees are + * prone to this problem because for one they are always freed + * one at a time. Further, an immediate AGFL block free can + * cause a btree join and require another block free before the + * real allocation can proceed. + * Deferring the free disconnects freeing up the AGFL slot from + * freeing the block. + */ + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), + 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); if (error) goto out_agbp_relse; } @@ -3118,8 +3167,6 @@ xfs_alloc_put_freelist( logflags |= XFS_AGF_BTREEBLKS; } - xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = xfs_buf_to_agfl_bno(agflbp); @@ -3152,7 +3199,7 @@ xfs_validate_ag_length( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && seqno != bp->b_pag->pag_agno) + if (bp->b_pag && seqno != pag_agno(bp->b_pag)) return __this_address; /* @@ -3321,13 +3368,13 @@ xfs_read_agf( int flags, struct xfs_buf **agfbpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agf(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); @@ -3350,12 +3397,13 @@ xfs_alloc_read_agf( int flags, struct xfs_buf **agfbpp) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *agfbp; struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_alloc_read_agf(pag); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != @@ -3376,7 +3424,7 @@ xfs_alloc_read_agf( pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + if (xfs_agfl_needs_reset(mp, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3389,24 +3437,48 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(pag->pag_mount)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, - &pag->pag_mount->m_allocbt_blks); + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } + #ifdef DEBUG - else if (!xfs_is_shutdown(pag->pag_mount)) { - ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); - ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); - ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); - ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); - ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); + /* + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. + */ + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks); + ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount); + ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest); + ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level); + ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); + xfs_trans_brelse(tp, agfbp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } } -#endif +#endif /* DEBUG */ + if (agfbpp) *agfbpp = agfbp; else @@ -3559,7 +3631,7 @@ xfs_alloc_vextent_finish( goto out_drop_perag; } - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno); ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); @@ -3580,8 +3652,8 @@ xfs_alloc_vextent_finish( if (error) goto out_drop_perag; - ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, - args->len)); + ASSERT(!xfs_extent_busy_search(pag_group(args->pag), + args->agbno, args->len)); } xfs_ag_resv_alloc_extent(args->pag, args->resv, args); @@ -3611,21 +3683,20 @@ xfs_alloc_vextent_this_ag( struct xfs_alloc_arg *args, xfs_agnumber_t agno) { - struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == agno); + ASSERT(pag_agno(args->pag) == agno); args->agno = agno; args->agbno = 0; trace_xfs_alloc_vextent_this_ag(args); - error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), - &minimum_agno); + error = xfs_alloc_vextent_check_args(args, + xfs_agbno_to_fsb(args->pag, 0), &minimum_agno); if (error) { if (error == -ENOSPC) return 0; @@ -3830,7 +3901,7 @@ xfs_alloc_vextent_exact_bno( int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3869,7 +3940,7 @@ xfs_alloc_vextent_near_bno( int error; if (!needs_perag) - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3906,7 +3977,7 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = pag->pag_agno; + args.agno = pag_agno(pag); args.pag = pag; /* @@ -3974,14 +4045,13 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, - type); + error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags); return 0; err_release: @@ -4006,7 +4076,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0b956f8b9d5a..50ef79a1ed41 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -53,11 +53,9 @@ typedef struct xfs_alloc_arg { int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ + bool alloc_minlen_only; /* allocate exact minlen extent */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ -#ifdef DEBUG - bool alloc_minlen_only; /* allocate exact minlen extent */ -#endif } xfs_alloc_arg_t; /* @@ -80,6 +78,9 @@ int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, struct xfs_buf *agflbp, xfs_agblock_t bno, int btreeblk); +int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agblock_t bno, xfs_extlen_t len, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); /* * Compute and fill in value of m_alloc_maxlevels. @@ -194,8 +195,6 @@ int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); -int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, - struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); @@ -233,7 +232,16 @@ xfs_buf_to_agfl_bno( int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type, bool skip_discard); + enum xfs_ag_resv_type type, unsigned int free_flags); + +/* Don't issue a discard for the blocks freed. */ +#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0) + +/* Free blocks on the realtime device. */ +#define XFS_FREE_EXTENT_REALTIME (1U << 1) + +#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \ + XFS_FREE_EXTENT_REALTIME) /* * List of extents to be free "later". @@ -244,25 +252,28 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - struct xfs_perag *xefi_pag; + struct xfs_group *xefi_group; unsigned int xefi_flags; enum xfs_ag_resv_type xefi_agresv; }; -void xfs_extent_free_get_group(struct xfs_mount *mp, - struct xfs_extent_free_item *xefi); - #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ #define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ +#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */ + +static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi) +{ + return xefi->xefi_flags & XFS_EFI_REALTIME; +} struct xfs_alloc_autoreap { struct xfs_defer_pending *dfp; }; int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, - bool skip_discard, struct xfs_alloc_autoreap *aarp); + unsigned int free_flags, struct xfs_alloc_autoreap *aarp); void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, struct xfs_alloc_autoreap *aarp); void xfs_alloc_commit_autoreap(struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6ef5ddd89600..a4ac37ba5d51 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -28,7 +28,7 @@ xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } STATIC struct xfs_btree_cur * @@ -36,29 +36,29 @@ xfs_cntbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } - STATIC void xfs_allocbt_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; ASSERT(ptr->s != 0); if (xfs_btree_is_bno(cur->bc_ops)) { agf->agf_bno_root = ptr->s; be32_add_cpu(&agf->agf_bno_level, inc); - cur->bc_ag.pag->pagf_bno_level += inc; + pag->pagf_bno_level += inc; } else { agf->agf_cnt_root = ptr->s; be32_add_cpu(&agf->agf_cnt_level, inc); - cur->bc_ag.pag->pagf_cnt_level += inc; + pag->pagf_cnt_level += inc; } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); @@ -75,7 +75,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -86,7 +86,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_group, bno, 1, false); new->s = cpu_to_be32(bno); @@ -104,78 +104,17 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, - bno, 1); + error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp, + agbp, NULL, bno, 1); if (error) return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } -/* - * Update the longest extent in the AGF - */ -STATIC void -xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, - int reason) -{ - struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - struct xfs_perag *pag; - __be32 len; - int numrecs; - - ASSERT(!xfs_btree_is_bno(cur->bc_ops)); - - switch (reason) { - case LASTREC_UPDATE: - /* - * If this is the last leaf block and it's the last record, - * then update the size of the longest extent in the AG. - */ - if (ptr != xfs_btree_get_numrecs(block)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_INSREC: - if (be32_to_cpu(rec->alloc.ar_blockcount) <= - be32_to_cpu(agf->agf_longest)) - return; - len = rec->alloc.ar_blockcount; - break; - case LASTREC_DELREC: - numrecs = xfs_btree_get_numrecs(block); - if (ptr <= numrecs) - return; - ASSERT(ptr == numrecs + 1); - - if (numrecs) { - xfs_alloc_rec_t *rrp; - - rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); - len = rrp->ar_blockcount; - } else { - len = 0; - } - - break; - default: - ASSERT(0); - return; - } - - agf->agf_longest = len; - pag = cur->bc_ag.agbp->b_pag; - pag->pagf_longest = be32_to_cpu(len); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); -} - STATIC int xfs_allocbt_get_minrecs( struct xfs_btree_cur *cur, @@ -239,7 +178,7 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); if (xfs_btree_is_bno(cur->bc_ops)) ptr->s = agf->agf_bno_root; @@ -493,7 +432,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, @@ -511,7 +449,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = { const struct xfs_btree_ops xfs_cntbt_ops = { .name = "cnt", .type = XFS_BTREE_TYPE_AG, - .geom_flags = XFS_BTGEO_LASTREC_UPDATE, .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -525,7 +462,6 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, - .update_lastrec = xfs_allocbt_update_lastrec, .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, @@ -556,7 +492,7 @@ xfs_bnobt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -582,7 +518,7 @@ xfs_cntbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -633,11 +569,11 @@ xfs_allocbt_block_maxrecs( /* * Calculate number of records in an alloc btree block. */ -int +unsigned int xfs_allocbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { blocklen -= XFS_ALLOC_BLOCK_LEN(mp); return xfs_allocbt_block_maxrecs(blocklen, leaf); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 155b47f231ab..12647f9aaa6d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -53,7 +53,8 @@ struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp, struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag); -extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 673a4b6d2e8d..8c04acd30d48 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -26,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" +#include "xfs_parent.h" struct kmem_cache *xfs_attr_intent_cache; @@ -50,7 +51,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -87,6 +87,8 @@ xfs_attr_is_leaf( struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; + ASSERT(!xfs_need_iread_extents(ifp)); + if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS) return false; @@ -224,11 +226,21 @@ int xfs_attr_get_ilocked( struct xfs_da_args *args) { + int error; + xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; + /* + * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf + * to work correctly. + */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) @@ -264,9 +276,11 @@ xfs_attr_get( if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; + if (!args->owner) + args->owner = args->dp->i_ino; args->geo = args->dp->i_mount->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + xfs_attr_sethash(args); /* Entirely possible to look up a name which doesn't exist */ args->op_flags = XFS_DA_OP_OKNOENT; @@ -314,26 +328,20 @@ xfs_attr_calc_size( return nblks; } -/* Initialize transaction reservation for attr operations */ -void -xfs_init_attr_trans( - struct xfs_da_args *args, - struct xfs_trans_res *tres, - unsigned int *total) +/* Initialize transaction reservation for an xattr set/replace/upsert */ +inline struct xfs_trans_res +xfs_attr_set_resv( + const struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; - - if (args->value) { - tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; - *total = args->total; - } else { - *tres = M_RES(mp)->tr_attrrm; - *total = XFS_ATTRRM_SPACE_RES(mp); - } + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans_res ret = { + .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args->total, + .tr_logcount = XFS_ATTRSET_LOG_COUNT, + .tr_logflags = XFS_TRANS_PERM_LOG_RES, + }; + + return ret; } /* @@ -363,7 +371,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). */ - if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) + if (!error) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (xfs_has_wsync(dp->i_mount)) @@ -401,6 +409,77 @@ out: return error; } +/* Compute the hash value for a user/root/secure extended attribute */ +xfs_dahash_t +xfs_attr_hashname( + const uint8_t *name, + int namelen) +{ + return xfs_da_hashname(name, namelen); +} + +/* Compute the hash value for any extended attribute from any namespace. */ +xfs_dahash_t +xfs_attr_hashval( + struct xfs_mount *mp, + unsigned int attr_flags, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + ASSERT(xfs_attr_check_namespace(attr_flags)); + + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_hashattr(mp, name, namelen, value, valuelen); + + return xfs_attr_hashname(name, namelen); +} + +/* Save the current remote block info and clear the current pointers. */ +static void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; +} + +/* Set stored info about a remote block */ +static void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + +/* + * PPTR_REPLACE operations require the caller to set the old and new names and + * values explicitly. Update the canonical fields to the new name and value + * here now that the removal phase has finished. + */ +static void +xfs_attr_update_pptr_replace_args( + struct xfs_da_args *args) +{ + ASSERT(args->new_namelen > 0); + args->name = args->new_name; + args->namelen = args->new_namelen; + args->value = args->new_value; + args->valuelen = args->new_valuelen; + xfs_attr_sethash(args); +} + /* * Handle the state change on completion of a multi-state attr operation. * @@ -418,58 +497,84 @@ xfs_attr_complete_op( enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; - bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + replace_state = XFS_DAS_DONE; + else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE) + xfs_attr_update_pptr_replace_args(args); args->op_flags &= ~XFS_DA_OP_REPLACE; args->attr_filter &= ~XFS_ATTR_INCOMPLETE; - if (do_replace) - return replace_state; - - return XFS_DAS_DONE; + return replace_state; } +/* + * Try to add an attribute to an inode in leaf form. + */ static int xfs_attr_leaf_addname( struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_buf *bp; int error; ASSERT(xfs_attr_is_leaf(args->dp)); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); + if (error) + return error; + /* - * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. + * Look up the xattr name to set the insertion point for the new xattr. */ - error = xfs_attr_leaf_try_add(args); - - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + goto out_brelse; + trace_xfs_attr_leaf_replace(args); /* - * We're not in leaf format anymore, so roll the transaction and - * retry the add to the newly allocated node block. + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to + * add, not the attribute we just found and will remove later. */ - attr->xattri_dela_state = XFS_DAS_NODE_ADD; - goto out; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - if (error) - return error; /* * We need to commit and roll if we need to allocate remote xattr blocks * or perform more xattr manipulations. Otherwise there is nothing more * to do and we can return success. */ - if (args->rmtblkno) + if (!xfs_attr3_leaf_add(bp, args)) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + } else if (args->rmtblkno) { attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - else - attr->xattri_dela_state = xfs_attr_complete_op(attr, - XFS_DAS_LEAF_REPLACE); -out: + } else { + attr->xattri_dela_state = + xfs_attr_complete_op(attr, XFS_DAS_LEAF_REPLACE); + } + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return 0; + +out_brelse: + xfs_trans_brelse(args->trans, bp); return error; } @@ -492,7 +597,7 @@ xfs_attr_node_addname( return error; error = xfs_attr_node_try_addname(attr); - if (error == -ENOSPC) { + if (error == 1) { error = xfs_attr3_leaf_to_node(args); if (error) return error; @@ -647,8 +752,8 @@ xfs_attr_leaf_remove_attr( int forkoff; int error; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -679,7 +784,7 @@ xfs_attr_leaf_shrink( if (!xfs_attr_is_leaf(dp)) return 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); if (error) return error; @@ -868,6 +973,11 @@ xfs_attr_lookup( return -ENOATTR; } + /* Prerequisite for xfs_attr_is_leaf */ + error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK); + if (error) + return error; + if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -883,74 +993,73 @@ xfs_attr_lookup( return error; } -static void -xfs_attr_defer_add( - struct xfs_da_args *args, - unsigned int op_flags) +int +xfs_attr_add_fork( + struct xfs_inode *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; /* transaction pointer */ + unsigned int blks; /* space reservation */ + int error; /* error return value */ - struct xfs_attr_intent *new; + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - new = kmem_cache_zalloc(xfs_attr_intent_cache, - GFP_KERNEL | __GFP_NOFAIL); - new->xattri_op_flags = op_flags; - new->xattri_da_args = args; + blks = XFS_ADDAFORK_SPACE_RES(mp); - switch (op_flags) { - case XFS_ATTRI_OP_FLAGS_SET: - new->xattri_dela_state = xfs_attr_init_add_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REPLACE: - new->xattri_dela_state = xfs_attr_init_replace_state(args); - break; - case XFS_ATTRI_OP_FLAGS_REMOVE: - new->xattri_dela_state = xfs_attr_init_remove_state(args); - break; - default: - ASSERT(0); - } + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, + rsvd, &tp); + if (error) + return error; - xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); - trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); + if (xfs_inode_has_attr_fork(ip)) + goto trans_cancel; + + error = xfs_bmap_add_attrfork(tp, ip, size, rsvd); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* - * Note: If args->value is NULL the attribute will be removed, just like the - * Linux ->setattr API. + * Make a change to the xattr structure. + * + * The caller must have initialized @args, attached dquots, and must not hold + * any ILOCKs. Reserved data blocks may be used if @rsvd is set. + * + * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists. + * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist. + * Returns 0 on success, or a negative errno if something else went wrong. */ int xfs_attr_set( - struct xfs_da_args *args) + struct xfs_da_args *args, + enum xfs_attr_update op, + bool rsvd) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_trans_res tres; - bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; int rmt_blks = 0; - unsigned int total; - - if (xfs_is_shutdown(dp->i_mount)) - return -EIO; - - error = xfs_qm_dqattach(dp); - if (error) - return error; + unsigned int total = 0; - args->geo = mp->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->hashval = xfs_da_hashname(args->name, args->namelen); + ASSERT(!args->trans); - /* - * We have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail as well. Preserve the logged flag, since we need - * to pass that through to the logging code. - */ - args->op_flags = XFS_DA_OP_OKNOENT | - (args->op_flags & XFS_DA_OP_LOGGED); - - if (args->value) { + switch (op) { + case XFS_ATTRUPDATE_UPSERT: + case XFS_ATTRUPDATE_CREATE: + case XFS_ATTRUPDATE_REPLACE: XFS_STATS_INC(mp, xs_attr_set); args->total = xfs_attr_calc_size(args, &local); @@ -963,33 +1072,36 @@ xfs_attr_set( xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + error = xfs_attr_add_fork(dp, sf_size, rsvd); if (error) return error; } if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); - } else { + + tres = xfs_attr_set_resv(args); + total = args->total; + break; + case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); - rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); + rmt_blks = xfs_attr3_max_rmt_blocks(mp); + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + break; } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; - if (args->value || xfs_inode_hasattr(dp)) { - error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(args->trans, dp, - XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } @@ -997,26 +1109,26 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - if (!args->value) { + if (op == XFS_ATTRUPDATE_REMOVE) { /* if no value, we are performing a remove operation */ - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE); break; } /* Pure create fails if the attr already exists */ - if (args->attr_flags & XATTR_CREATE) + if (op == XFS_ATTRUPDATE_CREATE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ - if (!args->value) + if (op == XFS_ATTRUPDATE_REMOVE) goto out_trans_cancel; /* Pure replace fails if no existing attr to replace. */ - if (args->attr_flags & XATTR_REPLACE) + if (op == XFS_ATTRUPDATE_REPLACE) goto out_trans_cancel; - xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); + xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET); break; default: goto out_trans_cancel; @@ -1029,8 +1141,7 @@ xfs_attr_set( if (xfs_has_wsync(mp)) xfs_trans_set_sync(args->trans); - if (!(args->op_flags & XFS_DA_OP_NOTIME)) - xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. @@ -1039,6 +1150,7 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); + args->trans = NULL; return error; out_trans_cancel: @@ -1051,7 +1163,7 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ -static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; @@ -1110,88 +1222,6 @@ xfs_attr_shortform_addname( * External routines when attribute list is one block *========================================================================*/ -/* Save the current remote block info and clear the current pointers. */ -static void -xfs_attr_save_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno2 = args->blkno; - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; -} - -/* Set stored info about a remote block */ -static void -xfs_attr_restore_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno = args->blkno2; - args->index = args->index2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; -} - -/* - * Tries to add an attribute to an inode in leaf form - * - * This function is meant to execute as part of a delayed operation and leaves - * the transaction handling to the caller. On success the attribute is added - * and the inode and transaction are left dirty. If there is not enough space, - * the attr data is converted to node format and -ENOSPC is returned. Caller is - * responsible for handling the dirty inode and transaction or adding the attr - * in node format. - */ -STATIC int -xfs_attr_leaf_try_add( - struct xfs_da_args *args) -{ - struct xfs_buf *bp; - int error; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - /* - * Look up the xattr name to set the insertion point for the new xattr. - */ - error = xfs_attr3_leaf_lookup_int(bp, args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - goto out_brelse; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - goto out_brelse; - - trace_xfs_attr_leaf_replace(args); - /* - * Save the existing remote attr state so that the current - * values reflect the state of the new attribute we are about to - * add, not the attribute we just found and will remove later. - */ - xfs_attr_save_rmt_blk(args); - break; - case 0: - break; - default: - goto out_brelse; - } - - return xfs_attr3_leaf_add(bp, args); - -out_brelse: - xfs_trans_brelse(args->trans, bp); - return error; -} - /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -1202,7 +1232,7 @@ xfs_attr_leaf_hasname( { int error = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp); if (error) return error; @@ -1357,9 +1387,12 @@ error: /* * Add a name to a Btree-format attribute list. * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). + * This will involve walking down the Btree, and may involve splitting leaf + * nodes and even splitting intermediate nodes up to and including the root + * node (a special case of an intermediate node). + * + * If the tree was still in single leaf format and needs to converted to + * real node format return 1 and let the caller handle that. */ static int xfs_attr_node_try_addname( @@ -1367,21 +1400,21 @@ xfs_attr_node_try_addname( { struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; - int error; + int error = 0; trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_add(blk->bp, state->args); - if (error == -ENOSPC) { + if (!xfs_attr3_leaf_add(blk->bp, state->args)) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* * have been a b-tree. Let the caller deal with this. */ + error = 1; goto out; } @@ -1511,12 +1544,23 @@ out_release: return error; } +/* Enforce that there is at most one namespace bit per attr. */ +inline bool xfs_attr_check_namespace(unsigned int attr_flags) +{ + return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2; +} + /* Returns true if the attribute entry name is valid. */ bool xfs_attr_namecheck( + unsigned int attr_flags, const void *name, size_t length) { + /* Only one namespace bit allowed. */ + if (!xfs_attr_check_namespace(attr_flags)) + return false; + /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. @@ -1524,6 +1568,10 @@ xfs_attr_namecheck( if (length >= MAXNAMELEN) return false; + /* Parent pointers have their own validation. */ + if (attr_flags & XFS_ATTR_PARENT) + return xfs_parent_namecheck(attr_flags, name, length); + /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 81be9b3e4004..0e51d0723f9a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern { /* void; state communicated via *context */ -typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int); +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context, + int flags, unsigned char *name, int namelen, void *value, + int valuelen); struct xfs_attr_list_context { struct xfs_trans *tp; @@ -510,8 +511,8 @@ struct xfs_attr_intent { struct xfs_da_args *xattri_da_args; /* - * Shared buffer containing the attr name and value so that the logging - * code can share large memory buffers between log items. + * Shared buffer containing the attr name, new name, and value so that + * the logging code can share large memory buffers between log items. */ struct xfs_attri_log_nameval *xattri_nameval; @@ -529,6 +530,11 @@ struct xfs_attr_intent { struct xfs_bmbt_irec xattri_map; }; +static inline unsigned int +xfs_attr_intent_op(const struct xfs_attr_intent *attr) +{ + return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; +} /*======================================================================== * Function prototypes for the kernel. @@ -544,13 +550,22 @@ int xfs_inode_hasattr(struct xfs_inode *ip); bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); -int xfs_attr_set(struct xfs_da_args *args); + +enum xfs_attr_update { + XFS_ATTRUPDATE_REMOVE, /* remove attr */ + XFS_ATTRUPDATE_UPSERT, /* set value, replace any existing attr */ + XFS_ATTRUPDATE_CREATE, /* set value, fail if attr already exists */ + XFS_ATTRUPDATE_REPLACE, /* set value, fail if attr does not exist */ +}; + +int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd); int xfs_attr_set_iter(struct xfs_attr_intent *attr); int xfs_attr_remove_iter(struct xfs_attr_intent *attr); -bool xfs_attr_namecheck(const void *name, size_t length); +bool xfs_attr_check_namespace(unsigned int attr_flags); +bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, + size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); -void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, - unsigned int *total); +struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args); /* * Check to see if the attr should be upgraded from non-existent or shortform to @@ -590,7 +605,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args) static inline enum xfs_delattr_state xfs_attr_init_remove_state(struct xfs_da_args *args) { - args->op_flags |= XFS_DA_OP_REMOVE; if (xfs_attr_is_shortform(args->dp)) return XFS_DAS_SF_REMOVE; if (xfs_attr_is_leaf(args->dp)) @@ -614,8 +628,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen); + +xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags, + const uint8_t *name, int namelen, const void *value, + int valuelen); + +/* Set the hash value for any extended attribute from any namespace. */ +static inline void xfs_attr_sethash(struct xfs_da_args *args) +{ + args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter, + args->name, args->namelen, + args->value, args->valuelen); +} + extern struct kmem_cache *xfs_attr_intent_cache; int __init xfs_attr_intent_init_cache(void); void xfs_attr_intent_destroy_cache(void); +int xfs_attr_sf_totsize(struct xfs_inode *dp); +int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index ac904cc1a97b..fddb55605e0c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -47,7 +47,7 @@ */ STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, struct xfs_buf **bpp); -STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, +STATIC void xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int freemap_index); STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, @@ -388,6 +388,27 @@ xfs_attr3_leaf_verify( return NULL; } +xfs_failaddr_t +xfs_attr3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_attr3_leafblock *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_attr3_leaf_write_verify( struct xfs_buf *bp) @@ -448,16 +469,30 @@ int xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_attr3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); - return err; + return 0; } /*======================================================================== @@ -472,28 +507,57 @@ xfs_attr3_leaf_read( * INCOMPLETE flag will not be set in attr->attr_filter, but rather * XFS_DA_OP_RECOVERY will be set in args->op_flags. */ +static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args) +{ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return XFS_ATTR_NSP_ONDISK_MASK; + return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE; +} + +static inline bool +xfs_attr_parent_match( + const struct xfs_da_args *args, + const void *value, + unsigned int valuelen) +{ + ASSERT(args->value != NULL); + + /* Parent pointers do not use remote values */ + if (!value) + return false; + + /* + * The only value we support is a parent rec. However, we'll accept + * any valuelen so that offline repair can delete ATTR_PARENT values + * that are not parent pointers. + */ + if (valuelen != args->valuelen) + return false; + + return memcmp(args->value, value, valuelen) == 0; +} + static bool xfs_attr_match( struct xfs_da_args *args, - uint8_t namelen, - unsigned char *name, - int flags) + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen) { + unsigned int mask = xfs_attr_match_mask(args); if (args->namelen != namelen) return false; + if ((args->attr_filter & mask) != (attr_flags & mask)) + return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* Recovery ignores the INCOMPLETE flag. */ - if ((args->op_flags & XFS_DA_OP_RECOVERY) && - args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) - return true; + if (attr_flags & XFS_ATTR_PARENT) + return xfs_attr_parent_match(args, value, valuelen); - /* All remaining matches need to be filtered by INCOMPLETE state. */ - if (args->attr_filter != - (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) - return false; return true; } @@ -504,6 +568,13 @@ xfs_attr_copy_value( int valuelen) { /* + * Parent pointer lookups require the caller to specify the name and + * value, so don't copy anything. + */ + if (args->attr_filter & XFS_ATTR_PARENT) + return 0; + + /* * No copy if all we have to do is get the length */ if (!args->valuelen) { @@ -615,7 +686,7 @@ xfs_attr_shortform_bytesfit( */ if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) - dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + dsize = xfs_bmdr_space_calc(MINDBTPTRS); break; case XFS_DINODE_FMT_BTREE: /* @@ -629,7 +700,7 @@ xfs_attr_shortform_bytesfit( return 0; return dp->i_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); + dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot); break; } @@ -637,11 +708,11 @@ xfs_attr_shortform_bytesfit( * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ - minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -711,8 +782,9 @@ xfs_attr_sf_findname( for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) + if (xfs_attr_match(args, sfe->flags, sfe->nameval, + sfe->namelen, &sfe->nameval[sfe->namelen], + sfe->valuelen)) return sfe; } @@ -819,7 +891,8 @@ xfs_attr_sf_removename( */ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && + !xfs_has_parent(mp)) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -828,7 +901,8 @@ xfs_attr_sf_removename( ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || - dp->i_df.if_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE || + xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; sfe = xfs_attr_sf_firstentry(sf); for (i = 0; i < sf->count; i++) { @@ -911,15 +986,17 @@ xfs_attr_shortform_to_leaf( nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname(sfe->nameval, - sfe->namelen); nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; + if (!xfs_attr_check_namespace(sfe->flags)) { + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto out; + } + xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); - error = xfs_attr3_leaf_add(bp, &nargs); - ASSERT(error != -ENOSPC); - if (error) - goto out; + if (!xfs_attr3_leaf_add(bp, &nargs)) + ASSERT(0); sfe = xfs_attr_sf_nextentry(sfe); } error = 0; @@ -1027,7 +1104,7 @@ xfs_attr_shortform_verify( * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ - if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; @@ -1059,10 +1136,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); - if (!tmpbuffer) - return -ENOMEM; - + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1106,6 +1180,7 @@ xfs_attr3_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; + nargs.owner = args->owner; for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) @@ -1125,7 +1200,7 @@ xfs_attr3_leaf_to_shortform( error = 0; out: - kfree(tmpbuffer); + kvfree(tmpbuffer); return error; } @@ -1158,7 +1233,7 @@ xfs_attr3_leaf_to_node( error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1); if (error) goto out; @@ -1237,7 +1312,7 @@ xfs_attr3_leaf_create( ichdr.magic = XFS_ATTR3_LEAF_MAGIC; hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); @@ -1256,6 +1331,9 @@ xfs_attr3_leaf_create( /* * Split the leaf node, rebalance, then add the new entry. + * + * Returns 0 if the entry was added, 1 if a further split is needed or a + * negative error number otherwise. */ int xfs_attr3_leaf_split( @@ -1263,8 +1341,9 @@ xfs_attr3_leaf_split( struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk) { - xfs_dablk_t blkno; - int error; + bool added; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_split(state->args); @@ -1299,10 +1378,10 @@ xfs_attr3_leaf_split( */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr3_leaf_add(oldblk->bp, state->args); + added = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr3_leaf_add(newblk->bp, state->args); + added = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1310,13 +1389,15 @@ xfs_attr3_leaf_split( */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); - return error; + if (!added) + return 1; + return 0; } /* * Add a name to the leaf attribute list structure. */ -int +bool xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) @@ -1325,6 +1406,7 @@ xfs_attr3_leaf_add( struct xfs_attr3_icleaf_hdr ichdr; int tablesize; int entsize; + bool added = true; int sum; int tmp; int i; @@ -1353,7 +1435,7 @@ xfs_attr3_leaf_add( if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); if (ichdr.freemap[i].size >= tmp) { - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + xfs_attr3_leaf_add_work(bp, &ichdr, args, i); goto out_log_hdr; } sum += ichdr.freemap[i].size; @@ -1365,7 +1447,7 @@ xfs_attr3_leaf_add( * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) - return -ENOSPC; + return false; /* * Compact the entries to coalesce free space. @@ -1378,24 +1460,24 @@ xfs_attr3_leaf_add( * free region, in freemap[0]. If it is not big enough, give up. */ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { - tmp = -ENOSPC; + added = false; goto out_log_hdr; } - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); - return tmp; + return added; } /* * Add a name to a leaf attribute list structure. */ -STATIC int +STATIC void xfs_attr3_leaf_add_work( struct xfs_buf *bp, struct xfs_attr3_icleaf_hdr *ichdr, @@ -1513,7 +1595,6 @@ xfs_attr3_leaf_add_work( } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); - return 0; } /* @@ -1533,7 +1614,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1571,7 +1652,7 @@ xfs_attr3_leaf_compact( */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); - kfree(tmpbuffer); + kvfree(tmpbuffer); } /* @@ -1993,7 +2074,7 @@ xfs_attr3_leaf_toosmall( if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, - blkno, &bp); + state->args->owner, blkno, &bp); if (error) return error; @@ -2250,7 +2331,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kzalloc(state->args->geo->blksize, + tmp_leaf = kvzalloc(state->args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); /* @@ -2291,7 +2372,7 @@ xfs_attr3_leaf_unbalance( } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ - kfree(tmp_leaf); + kvfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); @@ -2401,18 +2482,23 @@ xfs_attr3_leaf_lookup_int( */ if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (!xfs_attr_match(args, name_loc->namelen, - name_loc->nameval, entry->flags)) + if (!xfs_attr_match(args, entry->flags, + name_loc->nameval, name_loc->namelen, + &name_loc->nameval[name_loc->namelen], + be16_to_cpu(name_loc->valuelen))) continue; args->index = probe; return -EEXIST; } else { + unsigned int valuelen; + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (!xfs_attr_match(args, name_rmt->namelen, - name_rmt->name, entry->flags)) + valuelen = be32_to_cpu(name_rmt->valuelen); + if (!xfs_attr_match(args, entry->flags, name_rmt->name, + name_rmt->namelen, NULL, valuelen)) continue; args->index = probe; - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = valuelen; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, @@ -2715,7 +2801,8 @@ xfs_attr3_leaf_clearflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2779,7 +2866,8 @@ xfs_attr3_leaf_setflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp); if (error) return error; @@ -2838,7 +2926,8 @@ xfs_attr3_leaf_flipflags( /* * Read the block containing the "old" attr */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno, &bp1); if (error) return error; @@ -2846,8 +2935,8 @@ xfs_attr3_leaf_flipflags( * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, - &bp2); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, + args->blkno2, &bp2); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 9b9948639c0f..589f810eedc0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -76,7 +76,7 @@ int xfs_attr3_leaf_split(struct xfs_da_state *state, int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, +bool xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); @@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); +xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp, + xfs_ino_t owner); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index ff0412828772..4c44ce1c8a64 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -43,19 +43,32 @@ * the logging system and therefore never have a log item. */ -/* - * Each contiguous block has a header, so it is not just a simple attribute - * length to FSB conversion. - */ -int +/* How many bytes can be stored in a remote value buffer? */ +inline unsigned int +xfs_attr3_rmt_buf_space( + struct xfs_mount *mp) +{ + unsigned int blocksize = mp->m_attr_geo->blksize; + + if (xfs_has_crc(mp)) + return blocksize - sizeof(struct xfs_attr3_rmt_hdr); + + return blocksize; +} + +/* Compute number of fsblocks needed to store a remote attr value */ +unsigned int xfs_attr3_rmt_blocks( - struct xfs_mount *mp, - int attrlen) + struct xfs_mount *mp, + unsigned int attrlen) { - if (xfs_has_crc(mp)) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; - } + /* + * Each contiguous block has a header, so it is not just a simple + * attribute length to FSB conversion. + */ + if (xfs_has_crc(mp)) + return howmany(attrlen, xfs_attr3_rmt_buf_space(mp)); + return XFS_B_TO_FSB(mp, attrlen); } @@ -92,7 +105,6 @@ xfs_attr3_rmt_verify( struct xfs_mount *mp, struct xfs_buf *bp, void *ptr, - int fsbsize, xfs_daddr_t bno) { struct xfs_attr3_rmt_hdr *rmt = ptr; @@ -103,7 +115,7 @@ xfs_attr3_rmt_verify( return __this_address; if (be64_to_cpu(rmt->rm_blkno) != bno) return __this_address; - if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt)) return __this_address; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) @@ -122,9 +134,9 @@ __xfs_attr3_rmt_read_verify( { struct xfs_mount *mp = bp->b_mount; char *ptr; - int len; + unsigned int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) @@ -141,7 +153,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -186,7 +198,7 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - int blksize = mp->m_attr_geo->blksize; + unsigned int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; @@ -203,7 +215,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -280,30 +292,30 @@ xfs_attr_rmtval_copyout( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_inode *dp, - int *offset, - int *valuelen, + xfs_ino_t owner, + unsigned int *offset, + unsigned int *valuelen, uint8_t **dst) { char *src = bp->b_addr; - xfs_ino_t ino = dp->i_ino; xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size = 0; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size = 0; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); if (xfs_has_crc(mp)) { - if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, + if (xfs_attr3_rmt_hdr_ok(src, owner, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", - bno, *offset, byte_cnt, ino); + bno, *offset, byte_cnt, owner); xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -330,20 +342,20 @@ xfs_attr_rmtval_copyin( struct xfs_mount *mp, struct xfs_buf *bp, xfs_ino_t ino, - int *offset, - int *valuelen, + unsigned int *offset, + unsigned int *valuelen, uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + unsigned int len = BBTOB(bp->b_length); + unsigned int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { - int hdr_size; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + unsigned int hdr_size; + unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp); byte_cnt = min(*valuelen, byte_cnt); hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, @@ -389,12 +401,12 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; uint8_t *dst = args->value; - int valuelen; + unsigned int valuelen; int nmap; int error; - int blkcnt = args->rmtblkcnt; + unsigned int blkcnt = args->rmtblkcnt; int i; - int offset = 0; + unsigned int offset = 0; trace_xfs_attr_rmtval_get(args); @@ -427,8 +439,7 @@ xfs_attr_rmtval_get( return error; error = xfs_attr_rmtval_copyout(mp, bp, args->dp, - &offset, &valuelen, - &dst); + args->owner, &offset, &valuelen, &dst); xfs_buf_relse(bp); if (error) return error; @@ -453,7 +464,7 @@ xfs_attr_rmt_find_hole( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int error; - int blkcnt; + unsigned int blkcnt; xfs_fileoff_t lfileoff = 0; /* @@ -482,11 +493,11 @@ xfs_attr_rmtval_set_value( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; uint8_t *src = args->value; - int blkcnt; - int valuelen; + unsigned int blkcnt; + unsigned int valuelen; int nmap; int error; - int offset = 0; + unsigned int offset = 0; /* * Roll through the "value", copying the attribute value to the @@ -522,8 +533,8 @@ xfs_attr_rmtval_set_value( return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; - xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, - &valuelen, &src); + xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen, + &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); @@ -626,7 +637,6 @@ xfs_attr_rmtval_set_blk( if (error) return error; - ASSERT(nmap == 1); ASSERT((map->br_startblock != DELAYSTARTBLOCK) && (map->br_startblock != HOLESTARTBLOCK)); @@ -646,7 +656,7 @@ xfs_attr_rmtval_invalidate( struct xfs_da_args *args) { xfs_dablk_t lblkno; - int blkcnt; + unsigned int blkcnt; int error; /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d097ec6c4dc3..e3c6c7d774bf 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -6,7 +6,13 @@ #ifndef __XFS_ATTR_REMOTE_H__ #define __XFS_ATTR_REMOTE_H__ -int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); +unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen); + +/* Number of rmt blocks needed to store the maximally sized attr value */ +static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp) +{ + return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); +} int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bc4422223024..73bdc0e55682 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort { uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ unsigned char *name; /* name value, pointer into buffer */ + void *value; } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 656c95a22f2e..d954f9b8071f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,11 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" +#include "xfs_inode_util.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -78,9 +80,9 @@ xfs_bmap_compute_maxlevels( maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), whichfork); if (whichfork == XFS_DATA_FORK) - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + sz = xfs_bmdr_space_calc(MINDBTPTRS); else - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + sz = xfs_bmdr_space_calc(MINABTPTRS); maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -101,8 +103,8 @@ xfs_bmap_compute_attr_offset( struct xfs_mount *mp) { if (mp->m_sb.sb_inodesize == 256) - return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); - return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); + return xfs_bmdr_space_calc(6 * MINABTPTRS); } STATIC int /* error */ @@ -169,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -297,7 +297,7 @@ xfs_check_block( prevp = NULL; for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + keyp = xfs_bmbt_key_addr(mp, block, i); if (prevp) { ASSERT(be64_to_cpu(prevp->br_startoff) < @@ -309,15 +309,15 @@ xfs_check_block( * Compare the block numbers to see if there are dups. */ if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz); else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr); for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz); else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr); if (*thispa == *pp) { xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, @@ -372,7 +372,7 @@ xfs_bmap_check_leaf_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLFSBLOCK); @@ -405,7 +405,7 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { xfs_btree_mark_sick(cur); @@ -445,14 +445,14 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. */ - ep = XFS_BMBT_REC_ADDR(mp, block, 1); + ep = xfs_bmbt_rec_addr(mp, block, 1); if (i) { ASSERT(xfs_bmbt_disk_get_startoff(&last) + xfs_bmbt_disk_get_blockcount(&last) <= xfs_bmbt_disk_get_startoff(ep)); } for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + nextp = xfs_bmbt_rec_addr(mp, block, j + 1); ASSERT(xfs_bmbt_disk_get_startoff(ep) + xfs_bmbt_disk_get_blockcount(ep) <= xfs_bmbt_disk_get_startoff(nextp)); @@ -583,9 +583,9 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { @@ -604,7 +604,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -613,7 +613,7 @@ xfs_bmap_btree_to_extents( xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -657,12 +657,11 @@ xfs_bmap_extents_to_btree( * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ - xfs_iroot_realloc(ip, 1, whichfork); + block = xfs_bmap_broot_realloc(ip, whichfork, 1); /* * Fill in the root. */ - block = ifp->if_broot; xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. @@ -713,7 +712,7 @@ xfs_bmap_extents_to_btree( for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt); + arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt); xfs_bmbt_disk_set_all(arp, &rec); cnt++; } @@ -723,10 +722,10 @@ xfs_bmap_extents_to_btree( /* * Fill in the root key and pointer. */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp = xfs_bmbt_key_addr(mp, block, 1); + arp = xfs_bmbt_rec_addr(mp, ablock, 1); kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur, be16_to_cpu(block->bb_level))); *pp = cpu_to_be64(args.fsbno); @@ -744,7 +743,7 @@ xfs_bmap_extents_to_btree( out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: - xfs_iroot_realloc(ip, -1, whichfork); + xfs_bmap_broot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -779,7 +778,7 @@ xfs_bmap_local_to_extents_empty( } -STATIC int /* error */ +int /* error */ xfs_bmap_local_to_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ @@ -789,7 +788,8 @@ xfs_bmap_local_to_extents( void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp)) + struct xfs_ifork *ifp, void *priv), + void *priv) { int error = 0; int flags; /* logging flags returned */ @@ -850,7 +850,7 @@ xfs_bmap_local_to_extents( * log here. Note that init_fn must also set the buffer log item type * correctly. */ - init_fn(tp, bp, ip, ifp); + init_fn(tp, bp, ip, ifp, priv); /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -894,7 +894,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) + if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -976,13 +976,14 @@ xfs_bmap_add_attrfork_local( dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; + dargs.owner = ip->i_ino; return xfs_dir2_sf_to_block(&dargs); } if (S_ISLNK(VFS_I(ip)->i_mode)) return xfs_bmap_local_to_extents(tp, ip, 1, flags, - XFS_DATA_FORK, - xfs_symlink_local_to_remote); + XFS_DATA_FORK, xfs_symlink_local_to_remote, + NULL); /* should only be called for types that support local format data */ ASSERT(0); @@ -1023,40 +1024,30 @@ xfs_bmap_set_attrforkoff( } /* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. + * Convert inode from non-attributed to attributed. Caller must hold the + * ILOCK_EXCL and the file cannot have an attr fork. */ int /* error code */ xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ + struct xfs_trans *tp, + struct xfs_inode *ip, /* incore inode pointer */ int size, /* space new attribute needs */ int rsvd) /* xact may use reserved blks */ { - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ + struct xfs_mount *mp = tp->t_mountp; int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(xfs_inode_has_attr_fork(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - - blks = XFS_ADDAFORK_SPACE_RES(mp); - - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, - rsvd, &tp); - if (error) - return error; - if (xfs_inode_has_attr_fork(ip)) - goto trans_cancel; + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + if (!xfs_is_metadir_inode(ip)) + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) - goto trans_cancel; + return error; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; @@ -1077,7 +1068,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto trans_cancel; + return error; if (!xfs_has_attr(mp) || (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; @@ -1096,14 +1087,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; - -trans_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + return 0; } /* @@ -1175,7 +1159,7 @@ xfs_iread_bmbt_block( } /* Copy records into the incore cache. */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); + frp = xfs_bmbt_rec_addr(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { struct xfs_bmbt_irec new; xfs_failaddr_t fa; @@ -1438,6 +1422,24 @@ xfs_bmap_last_offset( * Extent tree manipulation functions used during allocation. */ +static inline bool +xfs_bmap_same_rtgroup( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *left, + struct xfs_bmbt_irec *right) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, left->br_startblock) != + xfs_rtb_to_rgno(mp, right->br_startblock)) + return false; + } + + return true; +} + /* * Convert a delayed allocation to a real allocation. */ @@ -1507,7 +1509,8 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -1531,7 +1534,8 @@ xfs_bmap_add_extent_delay_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1586,6 +1590,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1616,6 +1621,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1650,6 +1656,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1684,6 +1691,7 @@ xfs_bmap_add_extent_delay_real( goto done; } } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1722,6 +1730,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING: @@ -1812,6 +1821,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_next(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); + ASSERT(da_new <= da_old); break; case BMAP_RIGHT_FILLING: @@ -1861,6 +1871,7 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount = temp; xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_next(ifp, &bma->icur); + ASSERT(da_new <= da_old); break; case 0: @@ -1975,7 +1986,7 @@ xfs_bmap_add_extent_delay_real( } if (da_new != da_old) - xfs_mod_delalloc(mp, (int64_t)da_new - da_old); + xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old); if (bma->cur) { da_new += bma->cur->bc_bmap.allocated; @@ -1983,11 +1994,10 @@ xfs_bmap_add_extent_delay_real( } /* adjust for changes in reserved delayed indirect blocks */ - if (da_new != da_old) { - ASSERT(state == 0 || da_new < da_old); - error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), - false); - } + if (da_new < da_old) + xfs_add_fdblocks(mp, da_old - da_new); + else if (da_new > da_old) + error = xfs_dec_fdblocks(mp, da_new - da_old, true); xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: @@ -2070,7 +2080,8 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -2094,7 +2105,8 @@ xfs_bmap_add_extent_unwritten_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; /* @@ -2558,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. - */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), - false); - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -2754,7 +2626,8 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &left, new)) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && @@ -2764,7 +2637,8 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &right)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -3121,6 +2995,30 @@ xfs_bmap_extsize_align( return 0; } +static inline bool +xfs_bmap_adjacent_valid( + struct xfs_bmalloca *ap, + xfs_fsblock_t x, + xfs_fsblock_t y) +{ + struct xfs_mount *mp = ap->ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ap->ip) && + (ap->datatype & XFS_ALLOC_USERDATA)) { + if (!xfs_has_rtgroups(mp)) + return x < mp->m_sb.sb_rblocks; + + return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) && + xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount && + xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents; + + } + + return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks; +} + #define XFS_ALLOC_GAP_UNITS 4 /* returns true if ap->blkno was modified */ @@ -3128,36 +3026,25 @@ bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - xfs_fsblock_t adjust; /* adjustment to block numbers */ - xfs_mount_t *mp; /* mount point structure */ - int rt; /* true if inode is realtime */ - -#define ISVALID(x,y) \ - (rt ? \ - (x) < mp->m_sb.sb_rblocks : \ - XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ - XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ - XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) - - mp = ap->ip->i_mount; - rt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->datatype & XFS_ALLOC_USERDATA); + xfs_fsblock_t adjust; /* adjustment to block numbers */ + /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && !isnullstartblock(ap->prev.br_startblock) && - ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, - ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, + ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); - if (adjust && - ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + if (adjust && xfs_bmap_adjacent_valid(ap, ap->blkno + adjust, + ap->prev.br_startblock)) ap->blkno += adjust; return true; } @@ -3180,7 +3067,8 @@ xfs_bmap_adjacent( !isnullstartblock(ap->prev.br_startblock) && (prevbno = ap->prev.br_startblock + ap->prev.br_blockcount) && - ISVALID(prevbno, ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, prevbno, + ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ @@ -3196,8 +3084,8 @@ xfs_bmap_adjacent( * number, then just use the end of the previous block. */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(prevbno + prevdiff, - ap->prev.br_startblock)) + xfs_bmap_adjacent_valid(ap, prevbno + prevdiff, + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -3229,9 +3117,11 @@ xfs_bmap_adjacent( * offset by our length. */ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(gotbno - gotdiff, gotbno)) + xfs_bmap_adjacent_valid(ap, gotbno - gotdiff, + gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->length, gotbno)) { + else if (xfs_bmap_adjacent_valid(ap, gotbno - ap->length, + gotbno)) { gotbno -= ap->length; gotdiff += adjust - ap->length; } else @@ -3259,7 +3149,7 @@ xfs_bmap_adjacent( return true; } } -#undef ISVALID + return false; } @@ -3280,7 +3170,7 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(pag->pag_mount, pag), + xfs_alloc_min_freelist(pag_mount(pag), pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3370,7 +3260,7 @@ xfs_bmap_alloc_account( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); return; } @@ -3394,7 +3284,7 @@ xfs_bmap_alloc_account( xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= ap->length; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; } else { fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -3422,6 +3312,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -3477,31 +3372,19 @@ xfs_bmap_process_allocated_extent( xfs_bmap_alloc_account(ap); } -#ifdef DEBUG static int xfs_bmap_exact_minlen_extent_alloc( - struct xfs_bmalloca *ap) + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) { - struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - int error; - - ASSERT(ap->length); - if (ap->minlen != 1) { - ap->blkno = NULLFSBLOCK; - ap->length = 0; + args->fsbno = NULLFSBLOCK; return 0; } - orig_offset = ap->offset; - orig_length = ap->length; - - args.alloc_minlen_only = 1; - - xfs_bmap_compute_alignments(ap, &args); + args->alloc_minlen_only = 1; + args->minlen = args->maxlen = ap->minlen; + args->total = ap->total; /* * Unlike the longest extent available in an AG, we don't track @@ -3511,39 +3394,16 @@ xfs_bmap_exact_minlen_extent_alloc( * we need not be concerned about a drop in performance in * "debug only" code paths. */ - ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - - args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - args.minlen = args.maxlen = ap->minlen; - args.total = ap->total; - - args.alignment = 1; - args.minalignslop = 0; - - args.minleft = ap->minleft; - args.wasdel = ap->wasdel; - args.resv = XFS_AG_RESV_NONE; - args.datatype = ap->datatype; - - error = xfs_alloc_vextent_first_ag(&args, ap->blkno); - if (error) - return error; - - if (args.fsbno != NULLFSBLOCK) { - xfs_bmap_process_allocated_extent(ap, &args, orig_offset, - orig_length); - } else { - ap->blkno = NULLFSBLOCK; - ap->length = 0; - } + ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); - return 0; + /* + * Call xfs_bmap_btalloc_low_space here as it first does a "normal" AG + * iteration and then drops args->total to args->minlen, which might be + * required to find an allocation for the transaction reservation when + * the file system is very full. + */ + return xfs_bmap_btalloc_low_space(ap, args); } -#else - -#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) - -#endif /* * If we are not low on available data blocks and we are allocating at @@ -3566,12 +3426,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3739,7 +3599,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for @@ -3801,8 +3662,11 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); + else if ((ap->datatype & XFS_ALLOC_USERDATA) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); else error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); @@ -4038,152 +3902,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - int error; - xfs_fileoff_t aoff = off; - - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - return error; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); - if (error) - goto out_unreserve_quota; - - error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); - if (error) - goto out_unreserve_blocks; - - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip->i_mount, alen + indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. - */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_blocks: - xfs_mod_fdblocks(mp, alen, false); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); - return error; -} - -static int -xfs_bmap_alloc_userdata( - struct xfs_bmalloca *bma) -{ - struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = xfs_bmapi_whichfork(bma->flags); - int error; - - /* - * Set the data type being allocated. For the data fork, the first data - * in the file is treated differently to all other allocations. For the - * attribute fork, we only need to ensure the allocated range is not on - * the busy list. - */ - bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { - bma->datatype |= XFS_ALLOC_USERDATA; - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - - if (mp->m_dalign && bma->length >= mp->m_dalign) { - error = xfs_bmap_isaeof(bma, whichfork); - if (error) - return error; - } - - if (XFS_IS_REALTIME_INODE(bma->ip)) - return xfs_bmap_rtalloc(bma); - } - - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - return xfs_bmap_exact_minlen_extent_alloc(bma); - - return xfs_bmap_btalloc(bma); -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4191,43 +3909,51 @@ xfs_bmapi_allocate( struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); - int tmp_logflags = 0; int error; ASSERT(bma->length > 0); - - /* - * For the wasdelay case, we could also just allocate the stuff asked - * for in this bmap call but that wouldn't be as good. - */ - if (bma->wasdel) { - bma->length = (xfs_extlen_t)bma->got.br_blockcount; - bma->offset = bma->got.br_startoff; - if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) - bma->prev.br_startoff = NULLFILEOFF; - } else { - bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); - if (!bma->eof) - bma->length = XFS_FILBLKS_MIN(bma->length, - bma->got.br_startoff - bma->offset); - } + ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN); if (bma->flags & XFS_BMAPI_CONTIG) bma->minlen = bma->length; else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) { - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - error = xfs_bmap_exact_minlen_extent_alloc(bma); - else - error = xfs_bmap_btalloc(bma); - } else { - error = xfs_bmap_alloc_userdata(bma); + if (!(bma->flags & XFS_BMAPI_METADATA)) { + /* + * For the data and COW fork, the first data in the file is + * treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range + * is not on the busy list. + */ + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + + if (mp->m_dalign && bma->length >= mp->m_dalign) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + } } - if (error || bma->blkno == NULLFSBLOCK) + + if ((bma->datatype & XFS_ALLOC_USERDATA) && + XFS_IS_REALTIME_INODE(bma->ip)) + error = xfs_bmap_rtalloc(bma); + else + error = xfs_bmap_btalloc(bma); + if (error) return error; + if (bma->blkno == NULLFSBLOCK) + return -ENOSPC; + + if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) { + xfs_bmap_mark_sick(bma->ip, whichfork); + return -EFSCORRUPTED; + } if (bma->flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); @@ -4260,8 +3986,6 @@ xfs_bmapi_allocate( error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, &bma->logflags, bma->flags); - - bma->logflags |= tmp_logflags; if (error) return error; @@ -4406,6 +4130,15 @@ xfs_bmapi_finish( * extent state if necessary. Details behaviour is controlled by the flags * parameter. Only allocates blocks from a single allocation group, to avoid * locking problems. + * + * Returns 0 on success and places the extent mappings in mval. nmaps is used + * as an input/output parameter where the caller specifies the maximum number + * of mappings that may be returned and xfs_bmapi_write passes back the number + * of mappings (including existing mappings) it found. + * + * Returns a negative error code on failure, including -ENOSPC when it could not + * allocate any blocks and -ENOSR when it did allocate blocks to convert a + * delalloc range, but those blocks were before the passed in range. */ int xfs_bmapi_write( @@ -4524,28 +4257,42 @@ xfs_bmapi_write( * allocation length request (which can be 64 bits in * length) and the bma length request, which is * xfs_extlen_t and therefore 32 bits. Hence we have to - * check for 32-bit overflows and handle them here. + * be careful and do the min() using the larger type to + * avoid overflows. */ - if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) - bma.length = XFS_MAX_BMBT_EXTLEN; - else - bma.length = len; + bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN); + + if (wasdelay) { + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_blockcount - + (bno - bma.got.br_startoff)); + } else { + if (!eof) + bma.length = XFS_FILBLKS_MIN(bma.length, + bma.got.br_startoff - bno); + } - ASSERT(len > 0); ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma); - if (error) + if (error) { + /* + * If we already allocated space in a previous + * iteration return what we go so far when + * running out of space. + */ + if (error == -ENOSPC && bma.nallocs) + break; goto error0; - if (bma.blkno == NULLFSBLOCK) - break; + } /* * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, - bma.length); + xfs_refcount_alloc_cow_extent(tp, + XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); } /* Deal with the allocated space we found. */ @@ -4575,7 +4322,6 @@ xfs_bmapi_write( if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } - *nmap = n; error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4586,7 +4332,22 @@ xfs_bmapi_write( ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + orig_nmap, n); + + /* + * When converting delayed allocations, xfs_bmapi_allocate ignores + * the passed in bno and always converts from the start of the found + * delalloc extent. + * + * To avoid a successful return with *nmap set to 0, return the magic + * -ENOSR error code for this particular case so that the caller can + * handle it. + */ + if (!n) { + ASSERT(bma.nallocs >= *nmap); + return -ENOSR; + } + *nmap = n; return 0; error0: xfs_bmapi_finish(&bma, whichfork, error); @@ -4599,8 +4360,8 @@ error0: * invocations to allocate the target offset if a large enough physical extent * is not available. */ -int -xfs_bmapi_convert_delalloc( +static int +xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, @@ -4630,11 +4391,8 @@ xfs_bmapi_convert_delalloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, whichfork, + error = xfs_iext_count_extend(tp, ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -4657,19 +4415,25 @@ xfs_bmapi_convert_delalloc( if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } bma.tp = tp; bma.ip = ip; bma.wasdel = true; - bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, - XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* + * Always allocate convert from the start of the delalloc extent even if + * that is outside the passed in range to create large contiguous + * extents on disk. + */ + bma.offset = bma.got.br_startoff; + bma.length = bma.got.br_blockcount; + + /* * When we're converting the delalloc reservations backing dirty pages * in the page cache, we must be careful about how we create the new * extents: @@ -4693,25 +4457,18 @@ xfs_bmapi_convert_delalloc( if (error) goto out_finish; - error = -ENOSPC; - if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) - goto out_finish; - if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) { - xfs_bmap_mark_sick(ip, whichfork); - error = -EFSCORRUPTED; - goto out_finish; - } - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), + bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4731,6 +4488,36 @@ out_trans_cancel: return error; } +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in iomap. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + loff_t offset, + struct iomap *iomap, + unsigned int *seq) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into iomap. Allocate in a loop because it may + * take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, + iomap, seq); + if (error) + return error; + } while (iomap->offset + iomap->length <= offset); + + return 0; +} + int xfs_bmapi_remap( struct xfs_trans *tp, @@ -4777,6 +4564,7 @@ xfs_bmapi_remap( } ip->i_nblocks += len; + ip->i_delayed_blks -= len; /* see xfs_bmap_defer_add */ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_format == XFS_DINODE_FMT_BTREE) @@ -4822,32 +4610,18 @@ error0: * ores == 1). The number of stolen blocks is returned. The availability and * subsequent accounting of stolen blocks is the responsibility of the caller. */ -static xfs_filblks_t +static void xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ - xfs_filblks_t *indlen2, /* ext2 worst indlen */ - xfs_filblks_t avail) /* stealable blocks */ + xfs_filblks_t *indlen2) /* ext2 worst indlen */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ - xfs_filblks_t stolen = 0; xfs_filblks_t resfactor; /* - * Steal as many blocks as we can to try and satisfy the worst case - * indlen for both new extents. - */ - if (ores < nres && avail) - stolen = XFS_FILBLKS_MIN(nres - ores, avail); - ores += stolen; - - /* nothing else to do if we've satisfied the new reservation */ - if (ores >= nres) - return stolen; - - /* * We can't meet the total required reservation for the two extents. * Calculate the percent of the overall shortage between both extents * and apply this percentage to each of the requested indlen values. @@ -4891,26 +4665,25 @@ xfs_bmap_split_indlen( *indlen1 = len1; *indlen2 = len2; - - return stolen; } -int +void xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; - xfs_filblks_t got_indlen, new_indlen, stolen; + xfs_filblks_t got_indlen, new_indlen, stolen = 0; uint32_t state = xfs_bmap_fork_to_state(whichfork); - int error = 0; + uint64_t fdblocks; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); @@ -4925,18 +4698,12 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) - xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); - /* * Update the inode delalloc counter now and wait to update the * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - ASSERT(!isrt); - error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); - if (error) - return error; + xfs_quota_unreserve_blkres(ip, del->br_blockcount); ip->i_delayed_blks -= del->br_blockcount; if (got->br_startoff == del->br_startoff) @@ -4990,8 +4757,24 @@ xfs_bmap_del_extent_delay( new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); WARN_ON_ONCE(!got_indlen || !new_indlen); - stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, - del->br_blockcount); + /* + * Steal as many blocks as we can to try and satisfy the worst + * case indlen for both new extents. + * + * However, we can't just steal reservations from the data + * blocks if this is an RT inodes as the data and metadata + * blocks come from different pools. We'll have to live with + * under-filled indirect reservation in this case. + */ + da_new = got_indlen + new_indlen; + if (da_new > da_old && !isrt) { + stolen = XFS_FILBLKS_MIN(da_new - da_old, + del->br_blockcount); + da_old += stolen; + } + if (da_new > da_old) + xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen); + da_new = got_indlen + new_indlen; got->br_startblock = nullstartblock((int)got_indlen); @@ -5003,20 +4786,29 @@ xfs_bmap_del_extent_delay( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); - da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; break; } ASSERT(da_old >= da_new); da_diff = da_old - da_new; - if (!isrt) - da_diff += del->br_blockcount; - if (da_diff) { - xfs_mod_fdblocks(mp, da_diff, false); - xfs_mod_delalloc(mp, -da_diff); + fdblocks = da_diff; + + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { + fdblocks += del->br_blockcount; } - return error; + + xfs_add_fdblocks(mp, fdblocks); + xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); } void @@ -5090,6 +4882,34 @@ xfs_bmap_del_extent_cow( ip->i_delayed_blks -= del->br_blockcount; } +static int +xfs_bmap_free_rtblocks( + struct xfs_trans *tp, + struct xfs_bmbt_irec *del) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = xfs_rtgroup_grab(tp->t_mountp, 0); + if (!rtg) + return -EIO; + + /* + * Ensure the bitmap and summary inodes are locked and joined to the + * transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP); + } + + error = xfs_rtfree_blocks(tp, rtg, del->br_startblock, + del->br_blockcount); + xfs_rtgroup_rele(rtg); + return error; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space. @@ -5107,8 +4927,7 @@ xfs_bmap_del_extent_real( { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int do_fx; /* free extent at end of routine */ - int error; /* error return value */ + int error = 0; /* error return value */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5151,20 +4970,10 @@ xfs_bmap_del_extent_real( return -ENOSPC; *logflagsp = XFS_ILOG_CORE; - if (xfs_ifork_is_realtime(ip, whichfork)) { - if (!(bflags & XFS_BMAPI_REMAP)) { - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); - if (error) - return error; - } - - do_fx = 0; + if (xfs_ifork_is_realtime(ip, whichfork)) qfield = XFS_TRANS_DQ_RTBCOUNT; - } else { - do_fx = 1; + else qfield = XFS_TRANS_DQ_BCOUNT; - } nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; @@ -5312,18 +5121,39 @@ xfs_bmap_del_extent_real( /* * If we need to, add to list of extents to delete. */ - if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (!(bflags & XFS_BMAPI_REMAP)) { + bool isrt = xfs_ifork_is_realtime(ip, whichfork); + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { - xfs_refcount_decrease_extent(tp, del); + xfs_refcount_decrease_extent(tp, isrt, del); + } else if (isrt && !xfs_has_rtgroups(mp)) { + error = xfs_bmap_free_rtblocks(tp, del); } else { + unsigned int efi_flags = 0; + + if ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN) + efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; + + /* + * Historically, we did not use EFIs to free realtime + * extents. However, when reverse mapping is enabled, + * we must maintain the same order of operations as the + * data device, which is: Remove the file mapping, + * remove the reverse mapping, and then free the + * blocks. Reflink for realtime volumes requires the + * same sort of ordering. Both features rely on + * rtgroups, so let's gate rt EFI usage on rtgroups. + */ + if (isrt) + efi_flags |= XFS_FREE_EXTENT_REALTIME; + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - XFS_AG_RESV_NONE, - ((bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN)); - if (error) - return error; + XFS_AG_RESV_NONE, efi_flags); } + if (error) + return error; } /* @@ -5414,16 +5244,6 @@ __xfs_bunmapi( } else cur = NULL; - if (isrt) { - /* - * Synchronize by locking the bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - } - extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts)) { @@ -5584,18 +5404,17 @@ __xfs_bunmapi( delete: if (wasdel) { - error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, - &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; + if (error) + goto error0; } - if (error) - goto error0; - end = del.br_startoff - 1; nodelete: /* @@ -5678,6 +5497,8 @@ xfs_bunmapi( */ STATIC bool xfs_bmse_can_merge( + struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_bmbt_irec *got, /* current extent to shift */ xfs_fileoff_t shift) /* shift fsb */ @@ -5693,7 +5514,8 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) || + !xfs_bmap_same_rtgroup(ip, whichfork, left, got)) return false; return true; @@ -5729,7 +5551,7 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - ASSERT(xfs_bmse_can_merge(left, got, shift)); + ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift)); new = *left; new.br_blockcount = blockcount; @@ -5891,7 +5713,8 @@ xfs_bmap_collapse_extents( goto del_cursor; } - if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(ip, whichfork, &prev, &got, + offset_shift_fsb)) { error = xfs_bmse_merge(tp, ip, whichfork, offset_shift_fsb, &icur, &got, &prev, cur, &logflags); @@ -6027,7 +5850,8 @@ xfs_bmap_insert_extents( * never find mergeable extents in this scenario. Check anyways * and warn if we encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + if (xfs_bmse_can_merge(ip, whichfork, &got, &next, + offset_shift_fsb)) WARN_ON_ONCE(1); } @@ -6354,6 +6178,7 @@ xfs_bunmapi_range( error = xfs_defer_finish(tpp); if (error) goto out; + cond_resched(); } out: return error; @@ -6401,3 +6226,50 @@ xfs_bmap_query_all( return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); } + +/* Helper function to extract extent size hint from inode */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (!xfs_is_always_cow_inode(ip) && + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) + return ip->i_extsize; + if (XFS_IS_REALTIME_INODE(ip) && + ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + +/* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_cowextsize; + if (XFS_IS_REALTIME_INODE(ip)) { + b = 0; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + b = ip->i_extsize; + } else { + b = xfs_get_extsz_hint(ip); + } + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f7662595309d..d5f2729305fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) @@ -158,7 +162,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; @@ -176,9 +180,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); -int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, + int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); +int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t total, int *logflagsp, int whichfork, + void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv), + void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -195,9 +206,9 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); -int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, +void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -212,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -226,6 +233,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, @@ -241,7 +249,7 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; - struct xfs_perag *bi_pag; + struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; @@ -289,4 +297,7 @@ typedef int (*xfs_bmap_query_range_fn)( int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, void *priv); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f5d84dcb58da..908d7b050e9c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -65,10 +65,10 @@ xfs_bmdr_to_bmbt( ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMDR_KEY_ADDR(dblock, 1); - tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); - tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + fkp = xfs_bmdr_key_addr(dblock, 1); + tkp = xfs_bmbt_key_addr(mp, rblock, 1); + fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); + tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -168,10 +168,10 @@ xfs_bmbt_to_bmdr( dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - tkp = XFS_BMDR_KEY_ADDR(dblock, 1); - fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); - tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + fkp = xfs_bmbt_key_addr(mp, rblock, 1); + tkp = xfs_bmdr_key_addr(dblock, 1); + fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -282,7 +282,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -516,6 +516,116 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } +static inline void +xfs_bmbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Reallocate the space for if_broot based on the number of records. Move the + * records and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by the caller. + * When growing this will create holes to be filled in by the caller. + * + * The caller must not request to add more records than would fit in the + * on-disk inode root. If the if_broot is currently NULL, then if we are + * adding records, one will be allocated. The caller must also not request + * that the number of records go below zero, although it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * whichfork -- which inode fork to change + * new_numrecs -- the new number of records requested for the if_broot array + * + * Returns the incore btree root block. + */ +struct xfs_btree_block * +xfs_bmap_broot_realloc( + struct xfs_inode *ip, + int whichfork, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + + /* + * Block mapping btrees do not support storing zero records; if this + * happens, the fork is being changed to FMT_EXTENTS. Free the broot + * and get out. + */ + if (new_numrecs == 0) + return xfs_broot_realloc(ifp, 0); + + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); + return broot; + } + + /* + * We're reducing, but not totally eliminating, numrecs. In this case, + * we are shrinking the if_broot buffer, so it must already exist. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); + + /* + * Shrink the btree root by moving the bmbt pointers, since they are + * not butted up against the btree block header, then reallocating + * broot. + */ + xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + ASSERT(xfs_bmap_bmdr_space(broot) <= + xfs_inode_fork_size(ip, whichfork)); + return broot; +} + +static struct xfs_btree_block * +xfs_bmbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, + new_numrecs); +} + const struct xfs_btree_ops xfs_bmbt_ops = { .name = "bmap", .type = XFS_BTREE_TYPE_INODE, @@ -543,6 +653,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, .keys_contiguous = xfs_bmbt_keys_contiguous, + .broot_realloc = xfs_bmbt_broot_realloc, }; /* @@ -645,13 +756,13 @@ xfs_bmbt_commit_staged_btree( /* * Calculate number of records in a bmap btree block. */ -int +unsigned int xfs_bmbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { - blocklen -= XFS_BMBT_BLOCK_LEN(mp); + blocklen -= xfs_bmbt_block_len(mp); return xfs_bmbt_block_maxrecs(blocklen, leaf); } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index de1b73f1225c..b238d559ab03 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -14,70 +14,6 @@ struct xfs_trans; struct xbtree_ifakeroot; /* - * Btree block header size depends on a superblock flag. - */ -#define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_has_crc(((mp))) ? \ - XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) - -#define XFS_BMBT_REC_ADDR(mp, block, index) \ - ((xfs_bmbt_rec_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_rec_t))) - -#define XFS_BMBT_KEY_ADDR(mp, block, index) \ - ((xfs_bmbt_key_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_key_t))) - -#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ - ((xfs_bmbt_ptr_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - (maxrecs) * sizeof(xfs_bmbt_key_t) + \ - ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) - -#define XFS_BMDR_REC_ADDR(block, index) \ - ((xfs_bmdr_rec_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_rec_t))) - -#define XFS_BMDR_KEY_ADDR(block, index) \ - ((xfs_bmdr_key_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_key_t))) - -#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ - ((xfs_bmdr_ptr_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - (maxrecs) * sizeof(xfs_bmdr_key_t) + \ - ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) - -/* - * These are to be used when we know the size of the block and - * we don't have a cursor. - */ -#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ - XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) - -#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ - (int)(XFS_BMBT_BLOCK_LEN(mp) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) - -#define XFS_BMAP_BROOT_SPACE(mp, bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) -#define XFS_BMDR_SPACE_CALC(nrecs) \ - (int)(sizeof(xfs_bmdr_block_t) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BMDR_SPACE(bb) \ - (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) - -/* * Maximum number of bmap btree levels. */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) @@ -99,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(int blocklen, int leaf); -extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_ino_t new_owner, @@ -121,4 +58,147 @@ void xfs_bmbt_destroy_cur_cache(void); void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, struct xfs_buf *bp, __u16 level, __u16 numrecs); +/* + * Btree block header size depends on a superblock flag. + */ +static inline size_t +xfs_bmbt_block_len(struct xfs_mount *mp) +{ + return xfs_has_crc(mp) ? + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN; +} + +/* Addresses of key, pointers, and records within an incore bmbt block. */ + +static inline struct xfs_bmbt_rec * +xfs_bmbt_rec_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_rec *) + ((char *)block + xfs_bmbt_block_len(mp) + + (index - 1) * sizeof(struct xfs_bmbt_rec)); +} + +static inline struct xfs_bmbt_key * +xfs_bmbt_key_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_key *) + ((char *)block + xfs_bmbt_block_len(mp) + + (index - 1) * sizeof(struct xfs_bmbt_key *)); +} + +static inline xfs_bmbt_ptr_t * +xfs_bmbt_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_bmbt_ptr_t *) + ((char *)block + xfs_bmbt_block_len(mp) + + maxrecs * sizeof(struct xfs_bmbt_key) + + (index - 1) * sizeof(xfs_bmbt_ptr_t)); +} + +/* Addresses of key, pointers, and records within an ondisk bmbt block. */ + +static inline struct xfs_bmbt_rec * +xfs_bmdr_rec_addr( + struct xfs_bmdr_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_bmbt_rec)); +} + +static inline struct xfs_bmbt_key * +xfs_bmdr_key_addr( + struct xfs_bmdr_block *block, + unsigned int index) +{ + return (struct xfs_bmbt_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_bmbt_key)); +} + +static inline xfs_bmbt_ptr_t * +xfs_bmdr_ptr_addr( + struct xfs_bmdr_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_bmbt_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_bmbt_key) + + (index - 1) * sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_bmbt_ptr_t * +xfs_bmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int i, + unsigned int sz) +{ + return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_bmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int nrecs) +{ + return xfs_bmbt_block_len(mp) + + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_bmap_broot_space( + struct xfs_mount *mp, + struct xfs_bmdr_block *bb) +{ + return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_bmdr_space_calc(unsigned int nrecs) +{ + return sizeof(struct xfs_bmdr_block) + + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_bmap_bmdr_space(struct xfs_btree_block *bb) +{ + return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); +} + +struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, + int whichfork, unsigned int new_numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index d29547572a68..299ce7fd11b0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -30,6 +30,12 @@ #include "xfs_health.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_quota.h" +#include "xfs_metafile.h" +#include "xfs_rtrefcount_btree.h" /* * Btree magic numbers. @@ -225,7 +231,7 @@ __xfs_btree_check_agblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; xfs_agblock_t agbno; @@ -331,7 +337,7 @@ __xfs_btree_check_ptr( return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_AG: - if (!xfs_verify_agbno(cur->bc_ag.pag, + if (!xfs_verify_agbno(to_perag(cur->bc_group), be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; break; @@ -372,7 +378,7 @@ xfs_btree_check_ptr( case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_ops->name, + cur->bc_group->xg_gno, cur->bc_ops->name, level, index); break; } @@ -523,20 +529,8 @@ xfs_btree_del_cursor( ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_AG: - if (cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); - break; - case XFS_BTREE_TYPE_INODE: - /* nothing to do */ - break; - case XFS_BTREE_TYPE_MEM: - if (cur->bc_mem.pag) - xfs_perag_put(cur->bc_mem.pag); - break; - } - + if (cur->bc_group) + xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } @@ -1017,22 +1011,22 @@ xfs_btree_readahead_agblock( struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, left), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, left), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, right), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, right), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } @@ -1091,7 +1085,7 @@ xfs_btree_ptr_to_daddr( switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, + *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), be32_to_cpu(ptr->s)); break; case XFS_BTREE_TYPE_INODE: @@ -1313,7 +1307,7 @@ xfs_btree_owner( case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; case XFS_BTREE_TYPE_AG: - return cur->bc_ag.pag->pag_agno; + return cur->bc_group->xg_gno; default: ASSERT(0); return 0; @@ -1331,30 +1325,6 @@ xfs_btree_init_block_cur( xfs_btree_owner(cur)); } -/* - * Return true if ptr is the last record in the btree and - * we need to track updates to this record. The decision - * will be further refined in the update_lastrec method. - */ -STATIC int -xfs_btree_is_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - int level) -{ - union xfs_btree_ptr ptr; - - if (level > 0) - return 0; - if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE)) - return 0; - - xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); - if (!xfs_btree_ptr_is_null(cur, &ptr)) - return 0; - return 1; -} - STATIC void xfs_btree_buf_to_ptr( struct xfs_btree_cur *cur, @@ -1573,12 +1543,16 @@ xfs_btree_log_recs( int first, int last) { + if (!bp) { + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); + return; + } xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - } /* @@ -2420,15 +2394,6 @@ xfs_btree_update( xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_log_recs(cur, bp, ptr, ptr); - /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, 0)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_UPDATE); - } - /* Pass new key value up to our parent. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, 0); @@ -3123,6 +3088,131 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ +/* Move the records from a root leaf block to a separate block. */ +STATIC void +xfs_btree_promote_leaf_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + struct xfs_btree_block *broot; + int numrecs = xfs_btree_get_numrecs(block); + + /* Copy the records from the leaf broot into the new child block. */ + rp = xfs_btree_rec_addr(cur, 1, block); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, crp, rp, numrecs); + + /* + * Increment the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * leaf to a node block. Free the existing leaf broot so that nobody + * thinks we need to migrate node pointers when we realloc the broot + * buffer after bumping nlevels. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels++; + cur->bc_levels[1].ptr = 1; + + /* + * Allocate a new node broot and initialize it to point to the new + * child block. + */ + broot = cur->bc_ops->broot_realloc(cur, 1); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, + cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); + + pp = xfs_btree_ptr_addr(cur, 1, broot); + kp = xfs_btree_key_addr(cur, 1, broot); + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, 0, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, cbp, 1, numrecs); +} + +/* + * Move the keys and pointers from a root block to a separate block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_promote_node_iroot( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *cbp, + union xfs_btree_ptr *cptr, + struct xfs_btree_block *cblock) +{ + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + int numrecs = xfs_btree_get_numrecs(block); + + /* + * Increase tree height, adjusting the root block level to match. + * We cannot change the root btree node size until we've copied the + * block contents to the new child block. + */ + be16_add_cpu(&block->bb_level, 1); + cur->bc_nlevels++; + cur->bc_levels[level + 1].ptr = 1; + + /* + * Adjust the root btree record count, then copy the keys from the old + * root to the new child block. + */ + xfs_btree_set_numrecs(block, 1); + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, numrecs); + + /* Check the pointers and copy them to the new child block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); + + /* + * Set the first keyptr to point to the new child block, then shrink + * the memory buffer for the root block. + */ + error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); + if (error) + return error; + xfs_btree_copy_ptrs(cur, pp, cptr, 1); + xfs_btree_get_keys(cur, cblock, kp); + + cur->bc_ops->broot_realloc(cur, 1); + + /* Attach the new block to the cursor and log it. */ + xfs_btree_setbuf(cur, level, cbp); + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, numrecs); + xfs_btree_log_ptrs(cur, cbp, 1, numrecs); + return 0; +} + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -3136,14 +3226,10 @@ xfs_btree_new_iroot( struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ - union xfs_btree_key *ckp; /* child key pointer */ - union xfs_btree_ptr *cpp; /* child ptr pointer */ - union xfs_btree_key *kp; /* pointer to btree key */ - union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr aptr; union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ - int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); @@ -3152,10 +3238,15 @@ xfs_btree_new_iroot( level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); - pp = xfs_btree_ptr_addr(cur, 1, block); + ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); + if (level > 0) + aptr = *xfs_btree_ptr_addr(cur, 1, block); + else + aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, + cur->bc_ino.ip->i_ino)); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = xfs_btree_alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3181,47 +3272,16 @@ xfs_btree_new_iroot( cblock->bb_u.s.bb_blkno = bno; } - be16_add_cpu(&block->bb_level, 1); - xfs_btree_set_numrecs(block, 1); - cur->bc_nlevels++; - ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); - cur->bc_levels[level + 1].ptr = 1; - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); - - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - error = xfs_btree_debug_check_ptr(cur, pp, i, level); + if (level > 0) { + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, + &nptr, cblock); if (error) goto error0; + } else { + xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); } - xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); - - error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); - if (error) - goto error0; - - xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - - xfs_iroot_realloc(cur->bc_ino.ip, - 1 - xfs_btree_get_numrecs(cblock), - cur->bc_ino.whichfork); - - xfs_btree_setbuf(cur, level, cbp); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); - xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); - - *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); + *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3392,7 +3452,7 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); + cur->bc_ops->broot_realloc(cur, numrecs + 1); *stat = 1; } else { /* A root block that needs replacing */ @@ -3602,14 +3662,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. + * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. + * + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. In this case, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -3618,15 +3695,6 @@ xfs_btree_insrec( } /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, rec, - ptr, LASTREC_INSREC); - } - - /* * Return the new block number, if any. * If there is one, give back a record value and a cursor too. */ @@ -3730,6 +3798,97 @@ error0: return error; } +/* Move the records from a child leaf block to the root block. */ +STATIC void +xfs_btree_demote_leaf_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int numrecs) +{ + union xfs_btree_rec *rp; + union xfs_btree_rec *crp; + struct xfs_btree_block *broot; + + /* + * Decrease the tree height. + * + * Trickery here: The amount of memory that we need per record for the + * ifork's btree root block may change when we convert the broot from a + * node to a leaf. Free the old node broot so that we can get a fresh + * leaf broot. + */ + cur->bc_ops->broot_realloc(cur, 0); + cur->bc_nlevels--; + + /* + * Allocate a new leaf broot and copy the records from the old child. + * Detach the old child from the cursor. + */ + broot = cur->bc_ops->broot_realloc(cur, numrecs); + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, + cur->bc_ino.ip->i_ino); + + rp = xfs_btree_rec_addr(cur, 1, broot); + crp = xfs_btree_rec_addr(cur, 1, cblock); + xfs_btree_copy_recs(cur, rp, crp, numrecs); + + cur->bc_levels[0].bp = NULL; +} + +/* + * Move the keyptrs from a child node block to the root block. + * + * Since the keyptr size does not change, all we have to do is increase the + * tree height, copy the keyptrs to the new internal node (cblock), shrink + * the root, and copy the pointers there. + */ +STATIC int +xfs_btree_demote_node_child( + struct xfs_btree_cur *cur, + struct xfs_btree_block *cblock, + int level, + int numrecs) +{ + struct xfs_btree_block *block; + union xfs_btree_key *ckp; + union xfs_btree_key *kp; + union xfs_btree_ptr *cpp; + union xfs_btree_ptr *pp; + int i; + int error; + + /* + * Adjust the root btree node size and the record count to match the + * doomed child so that we can copy the keyptrs ahead of changing the + * tree shape. + */ + block = cur->bc_ops->broot_realloc(cur, numrecs); + + xfs_btree_set_numrecs(block, numrecs); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + /* Copy keys from the doomed block. */ + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + /* Copy pointers from the doomed block. */ + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); + for (i = 0; i < numrecs; i++) { + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (error) + return error; + } + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + /* Decrease tree height, adjusting the root block level to match. */ + cur->bc_levels[level - 1].bp = NULL; + be16_add_cpu(&block->bb_level, -1); + cur->bc_nlevels--; + return 0; +} + /* * Try to merge a non-leaf block back into the inode root. * @@ -3742,34 +3901,31 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; - union xfs_btree_key *kp; - union xfs_btree_key *ckp; - union xfs_btree_ptr *pp; - union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; - int index; int numrecs; int error; #ifdef DEBUG union xfs_btree_ptr ptr; #endif - int i; ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); - ASSERT(cur->bc_nlevels > 1); + ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || + cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; - if (level == 1) + if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) + goto out0; + + /* If we're already a leaf, jump out. */ + if (level == 0) goto out0; /* @@ -3799,40 +3955,20 @@ xfs_btree_kill_iroot( ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif - index = numrecs - cur->bc_ops->get_maxrecs(cur, level); - if (index) { - xfs_iroot_realloc(cur->bc_ino.ip, index, - cur->bc_ino.whichfork); - block = ifp->if_broot; - } - - be16_add_cpu(&block->bb_numrecs, index); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - - kp = xfs_btree_key_addr(cur, 1, block); - ckp = xfs_btree_key_addr(cur, 1, cblock); - xfs_btree_copy_keys(cur, kp, ckp, numrecs); - - pp = xfs_btree_ptr_addr(cur, 1, block); - cpp = xfs_btree_ptr_addr(cur, 1, cblock); - - for (i = 0; i < numrecs; i++) { - error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); + if (level > 1) { + error = xfs_btree_demote_node_child(cur, cblock, level, + numrecs); if (error) return error; - } - - xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + } else + xfs_btree_demote_leaf_child(cur, cblock, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) return error; - cur->bc_levels[level - 1].bp = NULL; - be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); - cur->bc_nlevels--; out0: return 0; } @@ -3984,21 +4120,12 @@ xfs_btree_delrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we are tracking the last record in the tree and - * we are at the far right edge of the tree, update it. - */ - if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, NULL, - ptr, LASTREC_DELREC); - } - - /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's - * nothing left to do. + * nothing left to do. numrecs was decremented above. */ if (xfs_btree_at_iroot(cur, level)) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); + cur->bc_ops->broot_realloc(cur, numrecs); error = xfs_btree_kill_iroot(cur); if (error) @@ -4796,7 +4923,7 @@ xfs_btree_agblock_v5hdr_verify( return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; - if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) return __this_address; return NULL; } @@ -5207,7 +5334,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5217,7 +5344,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, @@ -5406,6 +5533,12 @@ xfs_btree_init_cur_caches(void) error = xfs_refcountbt_init_cur_cache(); if (error) goto err; + error = xfs_rtrmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rtrefcountbt_init_cur_cache(); + if (error) + goto err; return 0; err: @@ -5422,6 +5555,8 @@ xfs_btree_destroy_cur_caches(void) xfs_bmbt_destroy_cur_cache(); xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); + xfs_rtrmapbt_destroy_cur_cache(); + xfs_rtrefcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ @@ -5450,3 +5585,67 @@ xfs_btree_goto_left_edge( return 0; } + +/* Allocate a block for an inode-rooted metadata btree. */ +int +xfs_btree_alloc_metafile_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_alloc_arg args = { + .mp = cur->bc_mp, + .tp = cur->bc_tp, + .resv = XFS_AG_RESV_METAFILE, + .minlen = 1, + .maxlen = 1, + .prod = 1, + }; + struct xfs_inode *ip = cur->bc_ino.ip; + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + + xfs_metafile_resv_alloc_space(ip, &args); + + new->l = cpu_to_be64(args.fsbno); + *stat = 1; + return 0; +} + +/* Free a block from an inode-rooted metadata btree. */ +int +xfs_btree_free_metafile_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_is_metadir_inode(ip)); + + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); + error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, + 0); + if (error) + return error; + + xfs_metafile_resv_free_space(ip, tp, 1); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index f93374278aa1..355b304696e6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -135,7 +135,7 @@ struct xfs_btree_ops { /* offset of btree stats array */ unsigned int statoff; - /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ + /* sick mask for health reporting (not for bmap btrees) */ unsigned int sick_mask; /* cursor operations */ @@ -154,12 +154,6 @@ struct xfs_btree_ops { int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); - /* update last record information */ - void (*update_lastrec)(struct xfs_btree_cur *cur, - const struct xfs_btree_block *block, - const union xfs_btree_rec *rec, - int ptr, int reason); - /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); @@ -219,19 +213,27 @@ struct xfs_btree_ops { const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); + + /* + * Reallocate the space for if_broot to fit the number of records. + * Move the records and pointers in if_broot to fit the new size. When + * shrinking this will eliminate holes between the records and pointers + * created by the caller. When growing this will create holes to be + * filled in by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then if + * we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although it + * can go to zero. + */ + struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, + unsigned int new_numrecs); }; /* btree geometry flags */ -#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */ -#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */ - -/* - * Reasons for the update_lastrec method to be called. - */ -#define LASTREC_UPDATE 0 -#define LASTREC_INSREC 1 -#define LASTREC_DELREC 2 - +#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ +#define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; @@ -268,6 +270,7 @@ struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ + struct xfs_group *bc_group; /* per-type information */ union { @@ -278,13 +281,11 @@ struct xfs_btree_cur struct xbtree_ifakeroot *ifake; /* for staging cursor */ } bc_ino; struct { - struct xfs_perag *pag; struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; struct { struct xfbtree *xfbtree; - struct xfs_perag *pag; } bc_mem; }; @@ -296,7 +297,7 @@ struct xfs_btree_cur struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ - } bc_refc; /* refcountbt */ + } bc_refc; /* refcountbt/rtrefcountbt */ }; /* Must be at the end of the struct! */ @@ -499,7 +500,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); @@ -702,4 +703,10 @@ xfs_btree_at_iroot( level == cur->bc_nlevels - 1; } +int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, + int *stat); +int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, + struct xfs_buf *bp); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c index 036061fe32cc..f2f7b4305413 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.c +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -18,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_buf_item.h" #include "xfs_trace.h" +#include "xfs_rtgroup.h" /* Set the root of an in-memory btree. */ void @@ -57,10 +58,8 @@ xfbtree_dup_cursor( ncur->bc_flags = cur->bc_flags; ncur->bc_nlevels = cur->bc_nlevels; ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; - - if (cur->bc_mem.pag) - ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); - + if (cur->bc_group) + ncur->bc_group = xfs_group_hold(cur->bc_group); return ncur; } diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 694929703152..5ed84f9cc877 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -134,6 +134,7 @@ xfs_btree_stage_ifakeroot( cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; cur->bc_ino.forksize = ifake->if_fork_size; + cur->bc_ino.whichfork = XFS_STAGING_FORK; cur->bc_flags |= XFS_BTREE_STAGING; } @@ -573,6 +574,7 @@ xfs_btree_bload_compute_geometry( struct xfs_btree_bload *bbl, uint64_t nr_records) { + const struct xfs_btree_ops *ops = cur->bc_ops; uint64_t nr_blocks = 0; uint64_t nr_this_level; @@ -599,7 +601,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + if (ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -607,7 +609,9 @@ xfs_btree_bload_compute_geometry( * * Note that bmap btrees forbid records in the root. */ - if (level != 0 && nr_this_level <= avg_per_block) { + if ((level != 0 || + (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) && + nr_this_level <= avg_per_block) { nr_blocks++; break; } @@ -658,7 +662,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + if (ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 718d071bb21a..17d9e6154f19 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -252,6 +252,51 @@ xfs_da3_node_verify( return NULL; } +xfs_failaddr_t +xfs_da3_node_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) + return __this_address; + } + + return NULL; +} + +xfs_failaddr_t +xfs_da3_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_da_blkinfo *hdr = bp->b_addr; + + if (!xfs_has_crc(mp)) + return NULL; + + switch (hdr->magic) { + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + return xfs_attr3_leaf_header_check(bp, owner); + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + return xfs_da3_node_header_check(bp, owner); + case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC): + case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC): + return xfs_dir3_leaf_header_check(bp, owner); + } + + ASSERT(0); + return NULL; +} + static void xfs_da3_node_write_verify( struct xfs_buf *bp) @@ -486,7 +531,7 @@ xfs_da3_node_create( memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + hdr3->info.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; @@ -548,9 +593,8 @@ xfs_da3_split( switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: error = xfs_attr3_leaf_split(state, oldblk, newblk); - if ((error != 0) && (error != -ENOSPC)) { + if (error < 0) return error; /* GROT: attr is inconsistent */ - } if (!error) { addblk = newblk; break; @@ -572,6 +616,8 @@ xfs_da3_split( error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } + if (error == 1) + return -ENOSPC; if (error) return error; /* GROT: attr inconsistent */ addblk = newblk; @@ -1199,6 +1245,7 @@ xfs_da3_root_join( struct xfs_da3_icnode_hdr oldroothdr; int error; struct xfs_inode *dp = state->args->dp; + xfs_failaddr_t fa; trace_xfs_da_root_join(state->args); @@ -1225,6 +1272,13 @@ xfs_da3_root_join( error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* @@ -1259,6 +1313,7 @@ xfs_da3_node_toosmall( struct xfs_da_blkinfo *info; xfs_dablk_t blkno; struct xfs_buf *bp; + xfs_failaddr_t fa; struct xfs_da3_icnode_hdr nodehdr; int count; int forward; @@ -1333,6 +1388,13 @@ xfs_da3_node_toosmall( state->args->whichfork); if (error) return error; + fa = xfs_da3_node_header_check(bp, state->args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(state->args->trans, bp); + xfs_da_mark_sick(state->args); + return -EFSCORRUPTED; + } node = bp->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); @@ -1591,6 +1653,7 @@ xfs_da3_node_lookup_int( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_args *args; + xfs_failaddr_t fa; xfs_dablk_t blkno; xfs_dahash_t hashval; xfs_dahash_t btreehashval; @@ -1629,6 +1692,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_ATTR_LEAF_MAGIC || magic == XFS_ATTR3_LEAF_MAGIC) { + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; @@ -1636,6 +1705,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_DIR2_LEAFN_MAGIC || magic == XFS_DIR3_LEAFN_MAGIC) { + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; blk->hashval = xfs_dir2_leaf_lasthash(args->dp, blk->bp, NULL); @@ -1648,6 +1723,13 @@ xfs_da3_node_lookup_int( return -EFSCORRUPTED; } + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + blk->magic = XFS_DA_NODE_MAGIC; /* @@ -1820,6 +1902,7 @@ xfs_da3_blk_link( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int before = 0; int error; struct xfs_inode *dp = state->args->dp; @@ -1863,6 +1946,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1884,6 +1974,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1913,6 +2010,7 @@ xfs_da3_blk_unlink( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int error; /* @@ -1943,6 +2041,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1960,6 +2065,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1996,6 +2108,7 @@ xfs_da3_path_shift( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_buf *bp; + xfs_failaddr_t fa; xfs_dablk_t blkno = 0; int level; int error; @@ -2074,6 +2187,12 @@ xfs_da3_path_shift( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); @@ -2087,6 +2206,12 @@ xfs_da3_path_shift( break; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2094,6 +2219,12 @@ xfs_da3_path_shift( break; case XFS_DIR2_LEAFN_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2167,8 +2298,8 @@ xfs_da_grow_inode_int( struct xfs_inode *dp = args->dp; int w = args->whichfork; xfs_rfsblock_t nblks = dp->i_nblocks; - struct xfs_bmbt_irec map, *mapp; - int nmap, error, got, i, mapi; + struct xfs_bmbt_irec map, *mapp = ↦ + int nmap, error, got, i, mapi = 1; /* * Find a spot in the file space to put the new block. @@ -2184,14 +2315,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->total, &map, &nmap); - if (error) - return error; - - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = ↦ - mapi = 1; - } else if (nmap == 0 && count > 1) { + if (error == -ENOSPC && count > 1) { xfs_fileoff_t b; int c; @@ -2209,16 +2333,13 @@ xfs_da_grow_inode_int( args->total, &mapp[mapi], &nmap); if (error) goto out_free_map; - if (nmap < 1) - break; mapi += nmap; b = mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount; } - } else { - mapi = 0; - mapp = NULL; } + if (error) + goto out_free_map; /* * Count the blocks we got, make sure it matches the total. @@ -2290,6 +2411,7 @@ xfs_da3_swap_lastblock( struct xfs_buf *last_buf; struct xfs_buf *sib_buf; struct xfs_buf *par_buf; + xfs_failaddr_t fa; xfs_dahash_t dead_hash; xfs_fileoff_t lastoff; xfs_dablk_t dead_blkno; @@ -2326,6 +2448,14 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; + fa = xfs_da3_header_check(last_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(last_buf, fa); + xfs_trans_brelse(tp, last_buf); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + /* * Copy the last block into the dead buffer and log it. */ @@ -2364,6 +2494,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || @@ -2385,6 +2522,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || @@ -2408,6 +2552,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, @@ -2457,6 +2608,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 706baf36e175..354d5d65043e 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -54,17 +54,24 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const uint8_t *name; /* string (maybe not NULL terminated) */ - int namelen; /* length of string (maybe no NULL) */ - uint8_t filetype; /* filetype of inode for directories */ + const uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *new_name; /* new attr name */ void *value; /* set of bytes (maybe contain NULLs) */ - int valuelen; /* length of value */ - unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ - unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ - xfs_dahash_t hashval; /* hash value of name */ - xfs_ino_t inumber; /* input/output inode number */ + void *new_value; /* new xattr value (may contain NULLs) */ struct xfs_inode *dp; /* directory inode to manipulate */ struct xfs_trans *trans; /* current trans (changes over time) */ + + xfs_ino_t inumber; /* input/output inode number */ + xfs_ino_t owner; /* inode that owns the dir/attr data */ + + int valuelen; /* length of value */ + int new_valuelen; /* length of new_value */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t op_flags; /* operation flags */ + uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + short namelen; /* length of string (maybe no NULL) */ + short new_namelen; /* length of new attr name */ + xfs_dahash_t hashval; /* hash value of name */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ xfs_dablk_t blkno; /* blkno of attr leaf of interest */ @@ -77,7 +84,6 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -89,10 +95,8 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ #define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ -#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ -#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ -#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ -#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ +#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -100,8 +104,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" }, \ - { XFS_DA_OP_REMOVE, "REMOVE" }, \ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ { XFS_DA_OP_LOGGED, "LOGGED" } @@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); +xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner); extern struct kmem_cache *xfs_da_state_cache; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 060e5c96b70f..86de99e2f757 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -714,12 +714,30 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ #define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT) #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) + +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT) + +/* Private attr namespaces not exposed to userspace */ +#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT) + +#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ + XFS_ATTR_LOCAL | \ + XFS_ATTR_INCOMPLETE) + +#define XFS_ATTR_NAMESPACE_STR \ + { XFS_ATTR_LOCAL, "local" }, \ + { XFS_ATTR_ROOT, "root" }, \ + { XFS_ATTR_SECURE, "secure" }, \ + { XFS_ATTR_PARENT, "parent" } /* * Alignment for namelist and valuelist entries (since they are mixed @@ -862,9 +880,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) -#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_has_crc((mp)) ? \ - sizeof(struct xfs_attr3_rmt_hdr) : 0)) +unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp); /* Number of bytes in a directory block. */ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) @@ -875,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, struct xfs_da3_blkinfo *hdr3); +/* + * Parent pointer attribute format definition + * + * The xattr name contains the dirent name. + * The xattr value encodes the parent inode number and generation to ease + * opening parents by handle. + * The xattr hashval is xfs_dir2_namehash() ^ p_ino + */ +struct xfs_parent_rec { + __be64 p_ino; + __be32 p_gen; +} __packed; + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c13276095cc0..5b377cbbb1f7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -12,12 +12,14 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "xfs_bmap.h" @@ -26,7 +28,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" -#include "xfs_trans_priv.h" +#include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -555,7 +557,7 @@ xfs_defer_relog( * the log threshold once per call. */ if (threshold_lsn == NULLCOMMITLSN) { - threshold_lsn = xlog_grant_push_threshold(log, 0); + threshold_lsn = xfs_ail_get_push_target(log->l_ailp); if (threshold_lsn == NULLCOMMITLSN) break; } @@ -844,6 +846,12 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!ops->finish_item) { + ASSERT(ops->finish_item != NULL); + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + return NULL; + } + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) dfp = xfs_defer_alloc(&tp->t_dfops, ops); @@ -1091,7 +1099,11 @@ xfs_defer_ops_continue( ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. */ - if (dfc->dfc_held.dr_inos == 2) + if (dfc->dfc_held.dr_inos > 2) { + xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); + xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, + XFS_ILOCK_EXCL); + } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) @@ -1176,6 +1188,10 @@ xfs_defer_init_item_caches(void) error = xfs_attr_intent_init_cache(); if (error) goto err; + error = xfs_exchmaps_intent_init_cache(); + if (error) + goto err; + return 0; err: xfs_defer_destroy_item_caches(); @@ -1186,6 +1202,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 18a9fb92dde8..9effd95ddcd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -68,16 +68,25 @@ struct xfs_defer_op_type { extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; - +extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; /* * Deferred operation item relogging limits. */ -#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +/* + * Rename w/ parent pointers can require up to 5 inodes with deferred ops to + * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip. + * These inodes are locked in sorted order by their inode numbers + */ +#define XFS_DEFER_OPS_NR_INODES 5 #define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ /* Resources that must be held across a transaction roll. */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 4821519efad4..1775abcfa04d 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,6 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_health.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_parent.h" +#include "xfs_ag.h" +#include "xfs_ialloc.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -192,7 +197,7 @@ xfs_da_unmount( /* * Return 1 if directory contains only "." and "..". */ -int +static bool xfs_dir_isempty( xfs_inode_t *dp) { @@ -200,9 +205,9 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ - return 1; + return true; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) - return 0; + return false; sfp = dp->i_df.if_data; return !sfp->count; } @@ -250,11 +255,68 @@ xfs_dir_init( args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; + args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } +enum xfs_dir2_fmt +xfs_dir2_format( + struct xfs_da_args *args, + int *error) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t eof; + + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + *error = 0; + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return XFS_DIR2_FMT_SF; + + *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); + if (*error) + return XFS_DIR2_FMT_ERROR; + + if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { + if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { + xfs_da_mark_sick(args); + *error = -EFSCORRUPTED; + return XFS_DIR2_FMT_ERROR; + } + return XFS_DIR2_FMT_BLOCK; + } + if (eof == geo->leafblk + geo->fsbcount) + return XFS_DIR2_FMT_LEAF; + return XFS_DIR2_FMT_NODE; +} + +int +xfs_dir_createname_args( + struct xfs_da_args *args) +{ + int error; + + if (!args->inumber) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_addname(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_addname(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_addname(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_addname(args); + default: + return error; + } +} + /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. @@ -269,7 +331,6 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -295,31 +356,9 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (!inum) - args->op_flags |= XFS_DA_OP_JUSTCHECK; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } + args->owner = dp->i_ino; - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); - -out_free: + rval = xfs_dir_createname_args(args); kfree(args); return rval; } @@ -340,16 +379,43 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmalloc(len, + args->value = kmemdup(name, len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; - memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } +int +xfs_dir_lookup_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + error = xfs_dir2_sf_lookup(args); + break; + case XFS_DIR2_FMT_BLOCK: + error = xfs_dir2_block_lookup(args); + break; + case XFS_DIR2_FMT_LEAF: + error = xfs_dir2_leaf_lookup(args); + break; + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_node_lookup(args); + break; + default: + break; + } + + if (error != -EEXIST) + return error; + return 0; +} + /* * Lookup a name in a directory, give back the inode number. * If ci_name is not NULL, returns the actual name in ci_name if it differs @@ -366,7 +432,6 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -383,34 +448,12 @@ xfs_dir_lookup( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; + args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_lookup(args); - else - rval = xfs_dir2_node_lookup(args); - -out_check_rval: - if (rval == -EEXIST) - rval = 0; + rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { @@ -418,12 +461,31 @@ out_check_rval: ci_name->len = args->valuelen; } } -out_free: xfs_iunlock(dp, lock_mode); kfree(args); return rval; } +int +xfs_dir_removename_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_removename(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_removename(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_removename(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_removename(args); + default: + return error; + } +} + /* * Remove an entry from a directory. */ @@ -431,13 +493,12 @@ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -456,30 +517,30 @@ xfs_dir_removename( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; + args->owner = dp->i_ino; + rval = xfs_dir_removename_args(args); + kfree(args); + return rval; +} - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_removename(args); - goto out_free; - } +int +xfs_dir_replace_args( + struct xfs_da_args *args) +{ + int error; - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_removename(args); - goto out_free; + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_replace(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_replace(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_replace(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_replace(args); + default: + return error; } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_removename(args); - else - rval = xfs_dir2_node_removename(args); -out_free: - kfree(args); - return rval; } /* @@ -495,7 +556,6 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -517,28 +577,8 @@ xfs_dir_replace( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_replace(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_replace(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_replace(args); - else - rval = xfs_dir2_node_replace(args); -out_free: + args->owner = dp->i_ino; + rval = xfs_dir_replace_args(args); kfree(args); return rval; } @@ -548,9 +588,9 @@ out_free: */ int xfs_dir_canenter( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name) /* name of entry to add */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } @@ -607,57 +647,6 @@ xfs_dir2_grow_inode( } /* - * See if the directory is a single-block form directory. - */ -int -xfs_dir2_isblock( - struct xfs_da_args *args, - bool *isblock) -{ - struct xfs_mount *mp = args->dp->i_mount; - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isblock = false; - if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) - return 0; - - *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { - xfs_da_mark_sick(args); - return -EFSCORRUPTED; - } - return 0; -} - -/* - * See if the directory is a single-leaf form directory. - */ -int -xfs_dir2_isleaf( - struct xfs_da_args *args, - bool *isleaf) -{ - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isleaf = false; - if (eof != args->geo->leafblk + args->geo->fsbcount) - return 0; - - *isleaf = true; - return 0; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. @@ -771,3 +760,653 @@ xfs_dir2_compname( return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } + +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. */ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. */ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + +/* + * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip + * into @dp under the given @name. If @ip is a directory, it will be + * initialized. Both inodes must have the ILOCK held and the transaction must + * have sufficient blocks reserved. + */ +int +xfs_dir_create_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOSPC); + return error; + } + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + error = xfs_dir_init(tp, ip, dp); + if (error) + return error; + + xfs_bumplink(tp, dp); + } + + /* + * If we have parent pointers, we need to add the attribute containing + * the parent information now. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, 1, name); + return 0; +} + +/* + * Given a directory @dp, an existing non-directory inode @ip, and a @name, + * link @ip into @dp under the given @name. Both inodes must have the ILOCK + * held. + */ +int +xfs_dir_add_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + struct xfs_mount *mp = tp->t_mountp; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + ASSERT(!S_ISDIR(VFS_I(ip)->i_mode)); + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + return error; + } + + /* + * Handle initial link state of O_TMPFILE inode + */ + if (VFS_I(ip)->i_nlink == 0) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); + if (error) + return error; + + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + xfs_bumplink(tp, ip); + + /* + * If we have parent pointers, we now need to add the parent record to + * the attribute fork of the inode. If this is the initial parent + * attribute, we need to create it correctly, otherwise we can just add + * the parent to the inode. + */ + if (du->ppargs) { + error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, 1, name); + return 0; +} + +/* + * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip) + * entry from the directory. Both inodes must have the ILOCK held. + */ +int +xfs_dir_remove_child( + struct xfs_trans *tp, + unsigned int resblks, + struct xfs_dir_update *du) +{ + struct xfs_inode *dp = du->dp; + const struct xfs_name *name = du->name; + struct xfs_inode *ip = du->ip; + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (S_ISDIR(VFS_I(ip)->i_mode)) { + ASSERT(VFS_I(ip)->i_nlink >= 2); + if (VFS_I(ip)->i_nlink != 2) + return -ENOTEMPTY; + if (!xfs_dir_isempty(ip)) + return -ENOTEMPTY; + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + return error; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + /* + * Point the unlinked child directory's ".." entry to the root + * directory to eliminate back-references to inodes that may + * get freed before the child directory is closed. If the fs + * gets shrunk, this can lead to dirent inode validation errors. + */ + if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) + return error; + } + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + return error; + + error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); + if (error) { + ASSERT(error != -ENOENT); + return error; + } + + /* Remove parent pointer. */ + if (du->ppargs) { + error = xfs_parent_removename(tp, du->ppargs, dp, name, ip); + if (error) + return error; + } + + xfs_dir_update_hook(dp, ip, -1, name); + return 0; +} + +/* + * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2, + * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed. + * @ip1 and @ip2 need not be of the same type. + * + * All inodes must have the ILOCK held, and both entries must already exist. + */ +int +xfs_dir_exchange_children( + struct xfs_trans *tp, + struct xfs_dir_update *du1, + struct xfs_dir_update *du2, + unsigned int spaceres) +{ + struct xfs_inode *dp1 = du1->dp; + const struct xfs_name *name1 = du1->name; + struct xfs_inode *ip1 = du1->ip; + struct xfs_inode *dp2 = du2->dp; + const struct xfs_name *name2 = du2->name; + struct xfs_inode *ip2 = du2->ip; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + int error; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); + if (error) + return error; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); + if (error) + return error; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, spaceres); + if (error) + return error; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + return error; + xfs_bumplink(tp, dp1); + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(VFS_I(ip1)->i_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, spaceres); + if (error) + return error; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(VFS_I(ip2)->i_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + return error; + xfs_bumplink(tp, dp2); + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* Schedule parent pointer replacements */ + if (du1->ppargs) { + error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1, + dp2, name2, ip1); + if (error) + return error; + } + + if (du2->ppargs) { + error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2, + dp1, name1, ip2); + if (error) + return error; + } + + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. + */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); + return 0; +} + +/* + * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry + * @target_name in directory @target_dp point to @src_ip and remove the + * original entry, cleaning up everything left behind. + * + * Cleanup involves dropping a link count on @target_ip, and either removing + * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry + * with (@src_name, @wip) if a whiteout inode @wip is supplied. + * + * All inodes must have the ILOCK held. We assume that if @src_ip is a + * directory then its '..' doesn't already point to @target_dp, and that @wip + * is a freshly allocated whiteout. + */ +int +xfs_dir_rename_children( + struct xfs_trans *tp, + struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, + unsigned int spaceres, + struct xfs_dir_update *du_wip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *src_dp = du_src->dp; + const struct xfs_name *src_name = du_src->name; + struct xfs_inode *src_ip = du_src->ip; + struct xfs_inode *target_dp = du_tgt->dp; + const struct xfs_name *target_name = du_tgt->name; + struct xfs_inode *target_ip = du_tgt->ip; + bool new_parent = (src_dp != target_dp); + bool src_is_directory; + int error; + + src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); + + /* + * Check for expected errors before we dirty the transaction + * so we can return an error without a transaction abort. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + return error; + } + } else { + /* + * If target exists and it's a directory, check that whether + * it can be destroyed. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode) && + (!xfs_dir_isempty(target_ip) || + (VFS_I(target_ip)->i_nlink > 2))) + return -EEXIST; + } + + /* + * Directory entry creation below may acquire the AGF. Remove + * the whiteout from the unlinked list first to preserve correct + * AGI/AGF locking order. This dirties the transaction so failures + * after this point will abort and log recovery will clean up the + * mess. + * + * For whiteouts, we need to bump the link count on the whiteout + * inode. After this point, we have a real link, clear the tmpfile + * state flag from the inode so it doesn't accidentally get misused + * in future. + */ + if (du_wip->ip) { + struct xfs_perag *pag; + + ASSERT(VFS_I(du_wip->ip)->i_nlink == 0); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino)); + error = xfs_iunlink_remove(tp, pag, du_wip->ip); + xfs_perag_put(pag); + if (error) + return error; + + xfs_bumplink(tp, du_wip->ip); + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + xfs_bumplink(tp, target_dp); + } + } else { /* target_ip != NULL */ + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + return error; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, spaceres); + ASSERT(error != -EEXIST); + if (error) + return error; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + return error; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (du_wip->ip) + error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino, + spaceres); + else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + spaceres); + if (error) + return error; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* Schedule parent pointer updates. */ + if (du_wip->ppargs) { + error = xfs_parent_addname(tp, du_wip->ppargs, src_dp, + src_name, du_wip->ip); + if (error) + return error; + } + + if (du_src->ppargs) { + error = xfs_parent_replacename(tp, du_src->ppargs, src_dp, + src_name, target_dp, target_name, src_ip); + if (error) + return error; + } + + if (du_tgt->ppargs) { + error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp, + target_name, target_ip); + if (error) + return error; + } + + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. + */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (du_wip->ip) + xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 8497d041f316..a6594a5a941d 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -36,6 +36,16 @@ xfs_dir2_samename( return !memcmp(n1->name, n2->name, n1->len); } +enum xfs_dir2_fmt { + XFS_DIR2_FMT_SF, + XFS_DIR2_FMT_BLOCK, + XFS_DIR2_FMT_LEAF, + XFS_DIR2_FMT_NODE, + XFS_DIR2_FMT_ERROR, +}; + +enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error); + /* * Convert inode mode to directory entry filetype */ @@ -48,7 +58,6 @@ extern void xfs_dir_startup(void); extern int xfs_da_mount(struct xfs_mount *mp); extern void xfs_da_unmount(struct xfs_mount *mp); -extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, @@ -58,13 +67,18 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t ino, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name); + const struct xfs_name *name); + +int xfs_dir_lookup_args(struct xfs_da_args *args); +int xfs_dir_createname_args(struct xfs_da_args *args); +int xfs_dir_removename_args(struct xfs_da_args *args); +int xfs_dir_replace_args(struct xfs_da_args *args); /* * Direct call from the bmap code, bypassing the generic directory layer. @@ -74,8 +88,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); @@ -101,6 +113,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; @@ -292,4 +308,51 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) return c; } +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + +struct xfs_parent_args; + +struct xfs_dir_update { + struct xfs_inode *dp; + const struct xfs_name *name; + struct xfs_inode *ip; + struct xfs_parent_args *ppargs; +}; + +int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); +int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); +int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks, + struct xfs_dir_update *du); + +int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1, + struct xfs_dir_update *du2, unsigned int spaceres); +int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src, + struct xfs_dir_update *du_tgt, unsigned int spaceres, + struct xfs_dir_update *du_wip); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a2da007adb46..0f93ed1a4a74 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -115,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_block_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->owner) != dp->i_ino) + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } @@ -136,6 +139,7 @@ int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; @@ -148,7 +152,7 @@ xfs_dir3_block_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_block_header_check(dp, *bpp); + fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -163,12 +167,13 @@ xfs_dir3_block_read( static void xfs_dir3_block_init( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); @@ -177,7 +182,7 @@ xfs_dir3_block_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; @@ -382,7 +387,7 @@ xfs_dir2_block_addname( tp = args->trans; /* Read the (one and only) directory block into bp. */ - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -697,7 +702,7 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -981,7 +986,8 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1009,7 +1015,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - xfs_dir3_block_init(mp, tp, dbp, dp); + xfs_dir3_block_init(args, dbp); needlog = 1; needscan = 0; @@ -1129,7 +1135,7 @@ xfs_dir2_sf_to_block( error = xfs_dir3_data_init(args, blkno, &bp); if (error) goto out_free; - xfs_dir3_block_init(mp, tp, bp, dp); + xfs_dir3_block_init(args, bp); hdr = bp->b_addr; /* @@ -1169,7 +1175,7 @@ xfs_dir2_sf_to_block( * Create entry for . */ dep = bp->b_addr + offset; - dep->inumber = cpu_to_be64(dp->i_ino); + dep->inumber = cpu_to_be64(args->owner); dep->namelen = 1; dep->name[0] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 7a6d965bea71..a16b05c43e2e 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -178,6 +178,14 @@ __xfs_dir3_data_check( while (offset < end) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + unsigned int reclen; + + /* + * Are the remaining bytes large enough to hold an + * unused entry? + */ + if (offset > end - xfs_dir2_data_unusedsize(1)) + return __this_address; /* * If it's unused, look for the space in the bestfree table. @@ -187,9 +195,13 @@ __xfs_dir3_data_check( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { xfs_failaddr_t fa; + reclen = xfs_dir2_data_unusedsize( + be16_to_cpu(dup->length)); if (lastfree != 0) return __this_address; - if (offset + be16_to_cpu(dup->length) > end) + if (be16_to_cpu(dup->length) != reclen) + return __this_address; + if (offset + reclen > end) return __this_address; if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != offset) @@ -207,10 +219,18 @@ __xfs_dir3_data_check( be16_to_cpu(bf[2].length)) return __this_address; } - offset += be16_to_cpu(dup->length); + offset += reclen; lastfree = 1; continue; } + + /* + * This is not an unused entry. Are the remaining bytes + * large enough for a dirent with a single-byte name? + */ + if (offset > end - xfs_dir2_data_entsize(mp, 1)) + return __this_address; + /* * It's a real entry. Validate the fields. * If this is a block directory then make sure it's @@ -219,9 +239,10 @@ __xfs_dir3_data_check( */ if (dep->namelen == 0) return __this_address; - if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) + reclen = xfs_dir2_data_entsize(mp, dep->namelen); + if (offset + reclen > end) return __this_address; - if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) + if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber))) return __this_address; if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset) return __this_address; @@ -245,7 +266,7 @@ __xfs_dir3_data_check( if (i >= be32_to_cpu(btp->count)) return __this_address; } - offset += xfs_dir2_data_entsize(mp, dep->namelen); + offset += reclen; } /* * Need to have seen all the entries and all the bestfree slots. @@ -395,17 +416,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_data_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } @@ -416,6 +440,7 @@ int xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp) @@ -429,7 +454,7 @@ xfs_dir3_data_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_data_header_check(dp, *bpp); + fa = xfs_dir3_data_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -725,7 +750,7 @@ xfs_dir3_data_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 08dda5ce9d91..71c2f22a3f6e 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -208,6 +208,29 @@ xfs_dir3_leaf_verify( return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } +xfs_failaddr_t +xfs_dir3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_leaf *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) && + hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_dir3_leaf_read_verify( struct xfs_buf *bp) @@ -271,32 +294,60 @@ int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); - return err; + return 0; } int xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); - return err; + return 0; } /* @@ -304,12 +355,12 @@ xfs_dir3_leafn_read( */ static void xfs_dir3_leaf_init( - struct xfs_mount *mp, - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, - xfs_ino_t owner, uint16_t type) { + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans *tp = args->trans; struct xfs_dir2_leaf *leaf = bp->b_addr; ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); @@ -323,7 +374,7 @@ xfs_dir3_leaf_init( ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - leaf3->info.owner = cpu_to_be64(owner); + leaf3->info.owner = cpu_to_be64(args->owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); @@ -356,7 +407,6 @@ xfs_dir3_leaf_get_buf( { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; - struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; @@ -369,7 +419,7 @@ xfs_dir3_leaf_get_buf( if (error) return error; - xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_init(args, bp, magic); xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) xfs_dir3_leaf_log_tail(args, bp); @@ -647,7 +697,8 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -834,9 +885,9 @@ xfs_dir2_leaf_addname( * Already had space in some data block. * Just read that one in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, use_block), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, use_block), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1238,7 +1289,8 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -1276,9 +1328,9 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, newdb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, newdb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1318,9 +1370,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, cidb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, cidb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1614,7 +1666,8 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; @@ -1753,7 +1806,8 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk, + &fbp); if (error) return error; xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index be0b8834028c..fe8d4fa13128 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -175,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { /* Everything ok in the free block header? */ static xfs_failaddr_t xfs_dir3_free_header_check( - struct xfs_inode *dp, - xfs_dablk_t fbno, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner, + xfs_dablk_t fbno) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; @@ -195,7 +195,7 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -214,6 +214,7 @@ static int __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, unsigned int flags, struct xfs_buf **bpp) @@ -227,7 +228,7 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + fa = xfs_dir3_free_header_check(*bpp, owner, fbno); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -299,20 +300,23 @@ int xfs_dir2_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp); } static int xfs_dir2_free_try_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK, + bpp); } static int @@ -349,7 +353,7 @@ xfs_dir3_free_get_buf( hdr.magic = XFS_DIR3_FREE_MAGIC; hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + hdr3->hdr.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; @@ -717,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname( if (curbp) xfs_trans_brelse(tp, curbp); - error = xfs_dir2_free_read(tp, dp, + error = xfs_dir2_free_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newfdb), &curbp); @@ -863,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir3_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newdb), 0, &curbp); @@ -1356,8 +1360,8 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(geo, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), - &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; free = fbp->b_addr; @@ -1562,7 +1566,8 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, + state->args->owner, blkno, &bp); if (error) return error; @@ -1715,7 +1720,7 @@ xfs_dir2_node_add_datablk( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) return error; @@ -1862,7 +1867,7 @@ xfs_dir2_node_find_freeblk( * so this might not succeed. This should be really rare, so * there's no reason to avoid it. */ - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) @@ -1948,9 +1953,8 @@ xfs_dir2_node_addname_int( &freehdr, &findex); } else { /* Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, dbno), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp); } if (error) return error; @@ -2302,7 +2306,7 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp); if (error) return error; /* diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 1db2e60ba827..10041350274a 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, /* xfs_dir2_block.c */ -extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_buf **bpp); +int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); @@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, + struct xfs_buf **bpp); int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, unsigned int flags); @@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from); int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -154,8 +155,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); -extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -189,6 +190,13 @@ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); static inline unsigned int +xfs_dir2_data_unusedsize( + unsigned int len) +{ + return round_up(len, XFS_DIR2_DATA_ALIGN); +} + +static inline unsigned int xfs_dir2_data_entsize( struct xfs_mount *mp, unsigned int namelen) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 15a362e2f5ea..dceef2abd4e2 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -16,6 +16,9 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_metadir.h" +#include "xfs_metafile.h" int xfs_calc_dquots_per_chunk( @@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts( return cpu_to_be32(t); } + +inline unsigned int +xfs_dqinode_sick_mask(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_SICK_FS_UQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_SICK_FS_GQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_SICK_FS_PQUOTA; + } + + ASSERT(0); + return 0; +} + +/* + * Load the inode for a given type of quota, assuming that the sb fields have + * been sorted out. This is not true when switching quota types on a V4 + * filesystem, so do not use this function for that. If metadir is enabled, + * @dp must be the /quota metadir. + * + * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on + * success; or a negative errno. + */ +int +xfs_dqinode_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type); + int error; + + if (!xfs_has_metadir(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_DQTYPE_USER: + ino = mp->m_sb.sb_uquotino; + break; + case XFS_DQTYPE_GROUP: + ino = mp->m_sb.sb_gquotino; + break; + case XFS_DQTYPE_PROJ: + ino = mp->m_sb.sb_pquotino; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + /* Should have set 0 to NULLFSINO when loading superblock */ + if (ino == NULLFSINO) + return -ENOENT; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip); + } else { + error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type), + metafile_type, &ip); + if (error == -ENOENT) + return error; + } + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return error; + } + + if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + +/* Create a metadata directory quota inode. */ +int +xfs_dqinode_metadir_create( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + }; + int error; + + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + error = xfs_metadir_commit(&upd); + if (error) + return error; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; +} + +#ifndef __KERNEL__ +/* Link a metadata directory quota inode. */ +int +xfs_dqinode_metadir_link( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode *ip) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + .ip = ip, + }; + int error; + + error = xfs_metadir_start_link(&upd); + if (error) + return error; + + error = xfs_metadir_link(&upd); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + return xfs_metadir_commit(&upd); +} +#endif /* __KERNEL__ */ + +/* Create the parent directory for all quota inodes and load it. */ +int +xfs_dqinode_mkdir_parent( + struct xfs_mount *mp, + struct xfs_inode **dpp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp); +} + +/* + * Load the parent directory of all quota inodes. Pass the inode to the caller + * because quota functions (e.g. QUOTARM) can be called on the quota files even + * if quotas are not enabled. + */ +int +xfs_dqinode_load_parent( + struct xfs_trans *tp, + struct xfs_inode **dpp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR, + dpp); +} diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 01a9e86b3037..a53c5d40e084 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -63,7 +63,9 @@ #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 -#define XFS_ERRTAG_MAX 44 +#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 +#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 +#define XFS_ERRTAG_MAX 46 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -111,5 +113,7 @@ #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 +#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 +#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c new file mode 100644 index 000000000000..3f1d6a98c118 --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -0,0 +1,1237 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_exchmaps_item.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr.h" +#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" +#include "xfs_symlink_remote.h" + +struct kmem_cache *xfs_exchmaps_intent_cache; + +/* bmbt mappings adjacent to a pair of records. */ +struct xfs_exchmaps_adjacent { + struct xfs_bmbt_irec left1; + struct xfs_bmbt_irec right1; + struct xfs_bmbt_irec left2; + struct xfs_bmbt_irec right2; +}; + +#define ADJACENT_INIT { \ + .left1 = { .br_startblock = HOLESTARTBLOCK }, \ + .right1 = { .br_startblock = HOLESTARTBLOCK }, \ + .left2 = { .br_startblock = HOLESTARTBLOCK }, \ + .right2 = { .br_startblock = HOLESTARTBLOCK }, \ +} + +/* Information to reset reflink flag / CoW fork state after an exchange. */ + +/* + * If the reflink flag is set on either inode, make sure it has an incore CoW + * fork, since all reflink inodes must have them. If there's a CoW fork and it + * has mappings in it, make sure the inodes are tagged appropriately so that + * speculative preallocations can be GC'd if we run low of space. + */ +static inline void +xfs_exchmaps_ensure_cowfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *cfork; + + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); + if (!cfork) + return; + if (cfork->if_bytes > 0) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); +} + +/* + * Adjust the on-disk inode size upwards if needed so that we never add + * mappings into the file past EOF. This is crucial so that log recovery won't + * get confused by the sudden appearance of post-eof mappings. + */ +STATIC void +xfs_exchmaps_update_size( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fsize_t new_isize) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsize_t len; + + if (new_isize < 0) + return; + + len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount), + new_isize); + + if (len <= ip->i_disk_size) + return; + + trace_xfs_exchmaps_update_inode_size(ip, len); + + ip->i_disk_size = len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Advance the incore state tracking after exchanging a mapping. */ +static inline void +xmi_advance( + struct xfs_exchmaps_intent *xmi, + const struct xfs_bmbt_irec *irec) +{ + xmi->xmi_startoff1 += irec->br_blockcount; + xmi->xmi_startoff2 += irec->br_blockcount; + xmi->xmi_blockcount -= irec->br_blockcount; +} + +/* Do we still have more mappings to exchange? */ +static inline bool +xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_blockcount > 0; +} + +/* Do we have post-operation cleanups to perform? */ +static inline bool +xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK | + XFS_EXCHMAPS_CLEAR_INO2_REFLINK | + __XFS_EXCHMAPS_INO2_SHORTFORM); +} + +/* Check all mappings to make sure we can actually exchange them. */ +int +xfs_exchmaps_check_forks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_ifork *ifp1, *ifp2; + int whichfork = xfs_exchmaps_reqfork(req); + + /* No fork? */ + ifp1 = xfs_ifork_ptr(req->ip1, whichfork); + ifp2 = xfs_ifork_ptr(req->ip2, whichfork); + if (!ifp1 || !ifp2) + return -EINVAL; + + /* We don't know how to exchange local format forks. */ + if (ifp1->if_format == XFS_DINODE_FMT_LOCAL || + ifp2->if_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_XFS_QUOTA +/* Log the actual updates to the quota accounting. */ +static inline void +xfs_exchmaps_update_quota( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int64_t ip1_delta = 0, ip2_delta = 0; + unsigned int qflag; + + qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT : + XFS_TRANS_DQ_BCOUNT; + + if (xfs_bmap_is_real_extent(irec1)) { + ip1_delta -= irec1->br_blockcount; + ip2_delta += irec1->br_blockcount; + } + + if (xfs_bmap_is_real_extent(irec2)) { + ip1_delta += irec2->br_blockcount; + ip2_delta -= irec2->br_blockcount; + } + + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta); + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta); +} +#else +# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0) +#endif + +/* Decide if we want to skip this mapping from file1. */ +static inline bool +xfs_exchmaps_can_skip_mapping( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = xmi->xmi_ip1->i_mount; + + /* Do not skip this mapping if the caller did not tell us to. */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN)) + return false; + + /* Do not skip mapped, written mappings. */ + if (xfs_bmap_is_written_extent(irec)) + return false; + + /* + * The mapping is unwritten or a hole. It cannot be a delalloc + * reservation because we already excluded those. It cannot be an + * unwritten extent with dirty page cache because we flushed the page + * cache. For files where the allocation unit is 1FSB (files on the + * data dev, rt files if the extent size is 1FSB), we can safely + * skip this mapping. + */ + if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1)) + return true; + + /* + * For a realtime file with a multi-fsb allocation unit, the decision + * is trickier because we can only swap full allocation units. + * Unwritten mappings can appear in the middle of an rtx if the rtx is + * partially written, but they can also appear for preallocations. + * + * If the mapping is a hole, skip it entirely. Holes should align with + * rtx boundaries. + */ + if (!xfs_bmap_is_real_extent(irec)) + return true; + + /* + * All mappings below this point are unwritten. + * + * - If the beginning is not aligned to an rtx, trim the end of the + * mapping so that it does not cross an rtx boundary, and swap it. + * + * - If both ends are aligned to an rtx, skip the entire mapping. + */ + if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) { + xfs_fileoff_t new_end; + + new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize); + irec->br_blockcount = min(irec->br_blockcount, + new_end - irec->br_startoff); + return false; + } + if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize)) + return true; + + /* + * All mappings below this point are unwritten, start on an rtx + * boundary, and do not end on an rtx boundary. + * + * - If the mapping is longer than one rtx, trim the end of the mapping + * down to an rtx boundary and skip it. + * + * - The mapping is shorter than one rtx. Swap it. + */ + if (irec->br_blockcount > mp->m_sb.sb_rextsize) { + xfs_fileoff_t new_end; + + new_end = rounddown_64(irec->br_startoff + irec->br_blockcount, + mp->m_sb.sb_rextsize); + irec->br_blockcount = new_end - irec->br_startoff; + return true; + } + + return false; +} + +/* + * Walk forward through the file ranges in @xmi until we find two different + * mappings to exchange. If there is work to do, return the mappings; + * otherwise we've reached the end of the range and xmi_blockcount will be + * zero. + * + * If the walk skips over a pair of mappings to the same storage, save them as + * the left records in @adj (if provided) so that the simulation phase can + * avoid an extra lookup. + */ +static int +xfs_exchmaps_find_mappings( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2, + struct xfs_exchmaps_adjacent *adj) +{ + int nimaps; + int bmap_flags; + int error; + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi)); + + for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) { + /* Read mapping from the first file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1, + xmi->xmi_blockcount, irec1, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec1->br_startblock == DELAYSTARTBLOCK || + irec1->br_startoff != xmi->xmi_startoff1) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. + */ + ASSERT(0); + return -EINVAL; + } + + if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) { + trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1); + continue; + } + + /* Read mapping from the second file */ + nimaps = 1; + error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2, + irec1->br_blockcount, irec2, &nimaps, + bmap_flags); + if (error) + return error; + if (nimaps != 1 || + irec2->br_startblock == DELAYSTARTBLOCK || + irec2->br_startoff != xmi->xmi_startoff2) { + /* + * We should never get no mapping or a delalloc mapping + * or something that doesn't match what we asked for, + * since the caller flushed both inodes and we hold the + * ILOCKs for both inodes. + */ + ASSERT(0); + return -EINVAL; + } + + /* + * We can only exchange as many blocks as the smaller of the + * two mapping maps. + */ + irec1->br_blockcount = min(irec1->br_blockcount, + irec2->br_blockcount); + + trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1); + trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2); + + /* We found something to exchange, so return it. */ + if (irec1->br_startblock != irec2->br_startblock) + return 0; + + /* + * Two mappings pointing to the same physical block must not + * have different states; that's filesystem corruption. Move + * on to the next mapping if they're both holes or both point + * to the same physical space extent. + */ + if (irec1->br_state != irec2->br_state) { + xfs_bmap_mark_sick(xmi->xmi_ip1, + xfs_exchmaps_whichfork(xmi)); + xfs_bmap_mark_sick(xmi->xmi_ip2, + xfs_exchmaps_whichfork(xmi)); + return -EFSCORRUPTED; + } + + /* + * Save the mappings if we're estimating work and skipping + * these identical mappings. + */ + if (adj) { + memcpy(&adj->left1, irec1, sizeof(*irec1)); + memcpy(&adj->left2, irec2, sizeof(*irec2)); + } + } + + return 0; +} + +/* Exchange these two mappings. */ +static void +xfs_exchmaps_one_step( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int whichfork = xfs_exchmaps_whichfork(xmi); + + xfs_exchmaps_update_quota(tp, xmi, irec1, irec2); + + /* Remove both mappings. */ + xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1); + xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2); + + /* + * Re-add both mappings. We exchange the file offsets between the two + * maps and add the opposite map, which has the effect of filling the + * logical offsets we just unmapped, but with with the physical mapping + * information exchanged. + */ + swap(irec1->br_startoff, irec2->br_startoff); + xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2); + xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1); + + /* Make sure we're not adding mappings past EOF. */ + if (whichfork == XFS_DATA_FORK) { + xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2, + xmi->xmi_isize1); + xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1, + xmi->xmi_isize2); + } + + /* + * Advance our cursor and exit. The caller (either defer ops or log + * recovery) will log the XMD item, and if *blockcount is nonzero, it + * will log a new XMI item for the remainder and call us back. + */ + xmi_advance(xmi, irec1); +} + +/* Convert inode2's leaf attr fork back to shortform, if possible.. */ +STATIC int +xfs_exchmaps_attr_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_buf *bp; + int forkoff; + int error; + + if (!xfs_attr_is_leaf(xmi->xmi_ip2)) + return 0; + + error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0, + &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2); + if (forkoff == 0) + return 0; + + return xfs_attr3_leaf_to_shortform(bp, &args, forkoff); +} + +/* Convert inode2's block dir fork back to shortform, if possible.. */ +STATIC int +xfs_exchmaps_dir_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_da_args args = { + .dp = xmi->xmi_ip2, + .geo = tp->t_mountp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = tp, + .owner = xmi->xmi_ip2->i_ino, + }; + struct xfs_dir2_sf_hdr sfh; + struct xfs_buf *bp; + int size; + int error = 0; + + if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK) + return error; + + error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp); + if (error) + return error; + + size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh); + if (size > xfs_inode_data_fork_size(xmi->xmi_ip2)) + return 0; + + return xfs_dir2_block_to_sf(&args, bp, size, &sfh); +} + +/* Convert inode2's remote symlink target back to shortform, if possible. */ +STATIC int +xfs_exchmaps_link_to_sf( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_inode *ip = xmi->xmi_ip2; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + char *buf; + int error; + + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + ip->i_disk_size > xfs_inode_data_fork_size(ip)) + return 0; + + /* Read the current symlink target into a buffer. */ + buf = kmalloc(ip->i_disk_size + 1, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + if (!buf) { + ASSERT(0); + return -ENOMEM; + } + + error = xfs_symlink_remote_read(ip, buf); + if (error) + goto free; + + /* Remove the blocks. */ + error = xfs_symlink_remote_truncate(tp, ip); + if (error) + goto free; + + /* Convert fork to local format and log our changes. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size); + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); +free: + kfree(buf); + return error; +} + +/* Clear the reflink flag after an exchange. */ +static inline void +xfs_exchmaps_clear_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_unset_inode_flag(ip); + + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Finish whatever work might come after an exchange operation. */ +static int +xfs_exchmaps_do_postop_work( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) { + int error = 0; + + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + error = xfs_exchmaps_attr_to_sf(tp, xmi); + else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_dir_to_sf(tp, xmi); + else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + error = xfs_exchmaps_link_to_sf(tp, xmi); + xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM; + if (error) + return error; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + } + + if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) { + xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2); + xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + return 0; +} + +/* Finish one step in a mapping exchange operation, possibly relogging. */ +int +xfs_exchmaps_finish_one( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + struct xfs_bmbt_irec irec1, irec2; + int error; + + if (xmi_has_more_exchange_work(xmi)) { + /* + * If the operation state says that some range of the files + * have not yet been exchanged, look for mappings in that range + * to exchange. If we find some mappings, exchange them. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL); + if (error) + return error; + + if (xmi_has_more_exchange_work(xmi)) + xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2); + + /* + * If the caller asked us to exchange the file sizes after the + * exchange and either we just exchanged the last mappings in + * the range or we didn't find anything to exchange, update the + * ondisk file sizes. + */ + if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) && + !xmi_has_more_exchange_work(xmi)) { + xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1; + xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2; + + xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE); + } + } else if (xmi_has_postop_work(xmi)) { + /* + * Now that we're finished with the exchange operation, + * complete the post-op cleanup work. + */ + error = xfs_exchmaps_do_postop_work(tp, xmi); + if (error) + return error; + } + + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + return -EIO; + + /* If we still have work to do, ask for a new transaction. */ + if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) { + trace_xfs_exchmaps_defer(tp->t_mountp, xmi); + return -EAGAIN; + } + + /* + * If we reach here, we've finished all the exchange work and the post + * operation work. The last thing we need to do before returning to + * the caller is to make sure that COW forks are set up correctly. + */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) { + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1); + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2); + } + + return 0; +} + +/* + * Compute the amount of bmbt blocks we should reserve for each file. In the + * worst case, each exchange will fill a hole with a new mapping, which could + * result in a btree split every time we add a new leaf block. + */ +static inline uint64_t +xfs_exchmaps_bmbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) * + XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req)); +} + +/* Compute the space we should reserve for the rmap btree expansions. */ +static inline uint64_t +xfs_exchmaps_rmapbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + if (XFS_IS_REALTIME_INODE(req->ip1)) + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * + XFS_RTRMAPADD_SPACE_RES(mp); + + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * + XFS_RMAPADD_SPACE_RES(mp); +} + +/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */ +int +xfs_exchmaps_estimate_overhead( + struct xfs_exchmaps_req *req) +{ + struct xfs_mount *mp = req->ip1->i_mount; + xfs_filblks_t bmbt_blocks; + xfs_filblks_t rmapbt_blocks; + xfs_filblks_t resblks = req->resblks; + + /* + * Compute the number of bmbt and rmapbt blocks we might need to handle + * the estimated number of exchanges. + */ + bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req); + rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req); + + trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks); + + /* Make sure the change in file block count doesn't overflow. */ + if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount)) + return -EFBIG; + if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount)) + return -EFBIG; + + /* + * Add together the number of blocks we need to handle btree growth, + * then add it to the number of blocks we need to reserve to this + * transaction. + */ + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, bmbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) + return -ENOSPC; + + /* Can't actually reserve more than UINT_MAX blocks. */ + if (req->resblks > UINT_MAX) + return -ENOSPC; + + req->resblks = resblks; + trace_xfs_exchmaps_final_estimate(req); + return 0; +} + +/* Decide if we can merge two real mappings. */ +static inline bool +xmi_can_merge( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't merge holes. */ + if (b1->br_startblock == HOLESTARTBLOCK || + b2->br_startblock == HOLESTARTBLOCK) + return false; + + /* We don't merge holes. */ + if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2)) + return false; + + if (b1->br_startoff + b1->br_blockcount == b2->br_startoff && + b1->br_startblock + b1->br_blockcount == b2->br_startblock && + b1->br_state == b2->br_state && + b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + return true; + + return false; +} + +/* + * Decide if we can merge three mappings. Caller must ensure all three + * mappings must not be holes or delalloc reservations. + */ +static inline bool +xmi_can_merge_all( + const struct xfs_bmbt_irec *l, + const struct xfs_bmbt_irec *m, + const struct xfs_bmbt_irec *r) +{ + xfs_filblks_t new_len; + + new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount; + return new_len <= XFS_MAX_BMBT_EXTLEN; +} + +#define CLEFT_CONTIG 0x01 +#define CRIGHT_CONTIG 0x02 +#define CHOLE 0x04 +#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG) + +#define NLEFT_CONTIG 0x10 +#define NRIGHT_CONTIG 0x20 +#define NHOLE 0x40 +#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG) + +/* Estimate the effect of a single exchange on mapping count. */ +static inline int +xmi_delta_nextents_step( + struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right) +{ + bool lhole, rhole, chole, nhole; + unsigned int state = 0; + int ret = 0; + + lhole = left->br_startblock == HOLESTARTBLOCK; + rhole = right->br_startblock == HOLESTARTBLOCK; + chole = curr->br_startblock == HOLESTARTBLOCK; + nhole = new->br_startblock == HOLESTARTBLOCK; + + if (chole) + state |= CHOLE; + if (!lhole && !chole && xmi_can_merge(left, curr)) + state |= CLEFT_CONTIG; + if (!rhole && !chole && xmi_can_merge(curr, right)) + state |= CRIGHT_CONTIG; + if ((state & CBOTH_CONTIG) == CBOTH_CONTIG && + !xmi_can_merge_all(left, curr, right)) + state &= ~CRIGHT_CONTIG; + + if (nhole) + state |= NHOLE; + if (!lhole && !nhole && xmi_can_merge(left, new)) + state |= NLEFT_CONTIG; + if (!rhole && !nhole && xmi_can_merge(new, right)) + state |= NRIGHT_CONTIG; + if ((state & NBOTH_CONTIG) == NBOTH_CONTIG && + !xmi_can_merge_all(left, new, right)) + state &= ~NRIGHT_CONTIG; + + switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) { + case CLEFT_CONTIG | CRIGHT_CONTIG: + /* + * left/curr/right are the same mapping, so deleting curr + * causes 2 new mappings to be created. + */ + ret += 2; + break; + case 0: + /* + * curr is not contiguous with any mapping, so we remove curr + * completely + */ + ret--; + break; + case CHOLE: + /* hole, do nothing */ + break; + case CLEFT_CONTIG: + case CRIGHT_CONTIG: + /* trim either left or right, no change */ + break; + } + + switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { + case NLEFT_CONTIG | NRIGHT_CONTIG: + /* + * left/curr/right will become the same mapping, so adding + * curr causes the deletion of right. + */ + ret--; + break; + case 0: + /* new is not contiguous with any mapping */ + ret++; + break; + case NHOLE: + /* hole, do nothing. */ + break; + case NLEFT_CONTIG: + case NRIGHT_CONTIG: + /* new is absorbed into left or right, no change */ + break; + } + + trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret, + state); + return ret; +} + +/* Make sure we don't overflow the extent (mapping) counters. */ +static inline int +xmi_ensure_delta_nextents( + struct xfs_exchmaps_req *req, + struct xfs_inode *ip, + int64_t delta) +{ + struct xfs_mount *mp = ip->i_mount; + int whichfork = xfs_exchmaps_reqfork(req); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + uint64_t new_nextents; + xfs_extnum_t max_nextents; + + if (delta < 0) + return 0; + + /* + * It's always an error if the delta causes integer overflow. delta + * needs an explicit cast here to avoid warnings about implicit casts + * coded into the overflow check. + */ + if (check_add_overflow(ifp->if_nextents, (uint64_t)delta, + &new_nextents)) + return -EFBIG; + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + new_nextents > 10) + return -EFBIG; + + /* + * We always promote both inodes to have large extent counts if the + * superblock feature is enabled, so we only need to check against the + * theoretical maximum. + */ + max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (new_nextents > max_nextents) + return -EFBIG; + + return 0; +} + +/* Find the next mapping after irec. */ +static inline int +xmi_next( + struct xfs_inode *ip, + int bmap_flags, + const struct xfs_bmbt_irec *irec, + struct xfs_bmbt_irec *nrec) +{ + xfs_fileoff_t off; + xfs_filblks_t blockcount; + int nimaps = 1; + int error; + + off = irec->br_startoff + irec->br_blockcount; + blockcount = XFS_MAX_FILEOFF - off; + error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags); + if (error) + return error; + if (nrec->br_startblock == DELAYSTARTBLOCK || + nrec->br_startoff != off) { + /* + * If we don't get the mapping we want, return a zero-length + * mapping, which our estimator function will pretend is a hole. + * We shouldn't get delalloc reservations. + */ + nrec->br_startblock = HOLESTARTBLOCK; + } + + return 0; +} + +int __init +xfs_exchmaps_intent_init_cache(void) +{ + xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent", + sizeof(struct xfs_exchmaps_intent), + 0, 0, NULL); + + return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_exchmaps_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_exchmaps_intent_cache); + xfs_exchmaps_intent_cache = NULL; +} + +/* + * Decide if we will exchange the reflink flags between the two files after the + * exchange. The only time we want to do this is if we're exchanging all + * mappings under EOF and the inode reflink flags have different states. + */ +static inline bool +xmi_can_exchange_reflink_flags( + const struct xfs_exchmaps_req *req, + unsigned int reflink_state) +{ + struct xfs_mount *mp = req->ip1->i_mount; + + if (hweight32(reflink_state) != 1) + return false; + if (req->startoff1 != 0 || req->startoff2 != 0) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size)) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size)) + return false; + return true; +} + + +/* Allocate and initialize a new incore intent item from a request. */ +struct xfs_exchmaps_intent * +xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + unsigned int rs = 0; + + xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&xmi->xmi_list); + xmi->xmi_ip1 = req->ip1; + xmi->xmi_ip2 = req->ip2; + xmi->xmi_startoff1 = req->startoff1; + xmi->xmi_startoff2 = req->startoff2; + xmi->xmi_blockcount = req->blockcount; + xmi->xmi_isize1 = xmi->xmi_isize2 = -1; + xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS; + + if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) { + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + return xmi; + } + + if (req->flags & XFS_EXCHMAPS_SET_SIZES) { + xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES; + xmi->xmi_isize1 = req->ip2->i_disk_size; + xmi->xmi_isize2 = req->ip1->i_disk_size; + } + + /* Record the state of each inode's reflink flag before the op. */ + if (xfs_is_reflink_inode(req->ip1)) + rs |= 1; + if (xfs_is_reflink_inode(req->ip2)) + rs |= 2; + + /* + * Figure out if we're clearing the reflink flags (which effectively + * exchanges them) after the operation. + */ + if (xmi_can_exchange_reflink_flags(req, rs)) { + if (rs & 1) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + if (rs & 2) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) || + S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + + return xmi; +} + +/* + * Estimate the number of exchange operations and the number of file blocks + * in each file that will be affected by the exchange operation. + */ +int +xfs_exchmaps_estimate( + struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + struct xfs_bmbt_irec irec1, irec2; + struct xfs_exchmaps_adjacent adj = ADJACENT_INIT; + xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0; + int64_t d_nexts1, d_nexts2; + int bmap_flags; + int error; + + ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS)); + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req)); + xmi = xfs_exchmaps_init_intent(req); + + /* + * To guard against the possibility of overflowing the extent counters, + * we have to estimate an upper bound on the potential increase in that + * counter. We can split the mapping at each end of the range, and for + * each step of the exchange we can split the mapping that we're + * working on if the mappings do not align. + */ + d_nexts1 = d_nexts2 = 3; + + while (xmi_has_more_exchange_work(xmi)) { + /* + * Walk through the file ranges until we find something to + * exchange. Because we're simulating the exchange, pass in + * adj to capture skipped mappings for correct estimation of + * bmbt record merges. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj); + if (error) + goto out_free; + if (!xmi_has_more_exchange_work(xmi)) + break; + + /* Update accounting. */ + if (xfs_bmap_is_real_extent(&irec1)) + ip1_blocks += irec1.br_blockcount; + if (xfs_bmap_is_real_extent(&irec2)) + ip2_blocks += irec2.br_blockcount; + req->nr_exchanges++; + + /* Read the next mappings from both files. */ + error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1); + if (error) + goto out_free; + + error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2); + if (error) + goto out_free; + + /* Update extent count deltas. */ + d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left1, &irec1, &irec2, &adj.right1); + + d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left2, &irec2, &irec1, &adj.right2); + + /* Now pretend we exchanged the mappings. */ + if (xmi_can_merge(&adj.left2, &irec1)) + adj.left2.br_blockcount += irec1.br_blockcount; + else + memcpy(&adj.left2, &irec1, sizeof(irec1)); + + if (xmi_can_merge(&adj.left1, &irec2)) + adj.left1.br_blockcount += irec2.br_blockcount; + else + memcpy(&adj.left1, &irec2, sizeof(irec2)); + + xmi_advance(xmi, &irec1); + } + + /* Account for the blocks that are being exchanged. */ + if (XFS_IS_REALTIME_INODE(req->ip1) && + xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) { + req->ip1_rtbcount = ip1_blocks; + req->ip2_rtbcount = ip2_blocks; + } else { + req->ip1_bcount = ip1_blocks; + req->ip2_bcount = ip2_blocks; + } + + /* + * Make sure that both forks have enough slack left in their extent + * counters that the exchange operation will not overflow. + */ + trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2); + if (req->ip1 == req->ip2) { + error = xmi_ensure_delta_nextents(req, req->ip1, + d_nexts1 + d_nexts2); + } else { + error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1); + if (error) + goto out_free; + error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2); + } + if (error) + goto out_free; + + trace_xfs_exchmaps_initial_estimate(req); + error = xfs_exchmaps_estimate_overhead(req); +out_free: + kmem_cache_free(xfs_exchmaps_intent_cache, xmi); + return error; +} + +/* Set the reflink flag before an operation. */ +static inline void +xfs_exchmaps_set_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_set_inode_flag(ip); + + ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * If either file has shared blocks and we're exchanging data forks, we must + * flag the other file as having shared blocks so that we get the shared-block + * rmap functions if we need to fix up the rmaps. + */ +void +xfs_exchmaps_ensure_reflink( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + unsigned int rs = 0; + + if (xfs_is_reflink_inode(xmi->xmi_ip1)) + rs |= 1; + if (xfs_is_reflink_inode(xmi->xmi_ip2)) + rs |= 2; + + if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2); + + if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1); +} + +/* Set the large extent count flag before an operation if needed. */ +static inline void +xfs_exchmaps_ensure_large_extent_counts( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + if (xfs_inode_has_large_extent_counts(ip)) + return; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Widen the extent counter fields of both inodes if necessary. */ +void +xfs_exchmaps_upgrade_extent_counts( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + if (!xfs_has_large_extent_counts(tp->t_mountp)) + return; + + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1); + xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2); +} + +/* + * Schedule an exchange a range of mappings from one inode to another. + * + * The use of file mapping exchange log intent items ensures the operation can + * be resumed even if the system goes down. The caller must commit the + * transaction to start the work. + * + * The caller must ensure the inodes must be joined to the transaction and + * ILOCKd; they will still be joined to the transaction at exit. + */ +void +xfs_exchange_mappings( + struct xfs_trans *tp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + + BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS); + + xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL); + xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL); + ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)); + if (req->flags & XFS_EXCHMAPS_SET_SIZES) + ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK)); + ASSERT(xfs_has_exchange_range(tp->t_mountp)); + + if (req->blockcount == 0) + return; + + xmi = xfs_exchmaps_init_intent(req); + xfs_exchmaps_defer_add(tp, xmi); + xfs_exchmaps_ensure_reflink(tp, xmi); + xfs_exchmaps_upgrade_extent_counts(tp, xmi); +} diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h new file mode 100644 index 000000000000..fa822dff202a --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHMAPS_H__ +#define __XFS_EXCHMAPS_H__ + +/* In-core deferred operation info about a file mapping exchange request. */ +struct xfs_exchmaps_intent { + /* List of other incore deferred work. */ + struct list_head xmi_list; + + /* Inodes participating in the operation. */ + struct xfs_inode *xmi_ip1; + struct xfs_inode *xmi_ip2; + + /* File offset range information. */ + xfs_fileoff_t xmi_startoff1; + xfs_fileoff_t xmi_startoff2; + xfs_filblks_t xmi_blockcount; + + /* Set these file sizes after the operation, unless negative. */ + xfs_fsize_t xmi_isize1; + xfs_fsize_t xmi_isize2; + + uint64_t xmi_flags; /* XFS_EXCHMAPS_* flags */ +}; + +/* Try to convert inode2 from block to short format at the end, if possible. */ +#define __XFS_EXCHMAPS_INO2_SHORTFORM (1ULL << 63) + +#define XFS_EXCHMAPS_INTERNAL_FLAGS (__XFS_EXCHMAPS_INO2_SHORTFORM) + +/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */ +#define XFS_EXCHMAPS_PARAMS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN) + +static inline int +xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi) +{ + if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +/* Parameters for a mapping exchange request. */ +struct xfs_exchmaps_req { + /* Inodes participating in the operation. */ + struct xfs_inode *ip1; + struct xfs_inode *ip2; + + /* File offset range information. */ + xfs_fileoff_t startoff1; + xfs_fileoff_t startoff2; + xfs_filblks_t blockcount; + + /* XFS_EXCHMAPS_* operation flags */ + uint64_t flags; + + /* + * Fields below this line are filled out by xfs_exchmaps_estimate; + * callers should initialize this part of the struct to zero. + */ + + /* + * Data device blocks to be moved out of ip1, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip1_bcount; + + /* + * Data device blocks to be moved out of ip2, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip2_bcount; + + /* rt blocks to be moved out of ip1. */ + xfs_filblks_t ip1_rtbcount; + + /* rt blocks to be moved out of ip2. */ + xfs_filblks_t ip2_rtbcount; + + /* Free space needed to handle the bmbt changes */ + unsigned long long resblks; + + /* Number of exchanges needed to complete the operation */ + unsigned long long nr_exchanges; +}; + +static inline int +xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req) +{ + if (req->flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req); +int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req); + +extern struct kmem_cache *xfs_exchmaps_intent_cache; + +int __init xfs_exchmaps_intent_init_cache(void); +void xfs_exchmaps_intent_destroy_cache(void); + +struct xfs_exchmaps_intent *xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req); +void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); +void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_finish_one(struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_check_forks(struct xfs_mount *mp, + const struct xfs_exchmaps_req *req); + +void xfs_exchange_mappings(struct xfs_trans *tp, + const struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHMAPS_H__ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 2b2f9050fbfb..9566a7623365 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -90,8 +90,7 @@ struct xfs_ifork; #define XFSLABEL_MAX 12 /* - * Superblock - in core version. Must match the ondisk version below. - * Must be padded to 64 bit alignment. + * Superblock - in core version. Must be padded to 64 bit alignment. */ typedef struct xfs_sb { uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ @@ -175,13 +174,20 @@ typedef struct xfs_sb { xfs_lsn_t sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ + xfs_ino_t sb_metadirino; /* metadata directory tree root */ + + xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ + xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ + uint8_t sb_rgblklog; /* rt group number shift */ + uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; -#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) - /* - * Superblock - on disk version. Must match the in core version above. + * Superblock - on disk version. * Must be padded to 64 bit alignment. */ struct xfs_dsb { @@ -262,9 +268,24 @@ struct xfs_dsb { __be64 sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ - /* must be padded to 64 bit alignment */ + __be64 sb_metadirino; /* metadata directory tree root */ + __be32 sb_rgcount; /* # of realtime groups */ + __be32 sb_rgextents; /* size of rtgroup in rtx */ + __u8 sb_rgblklog; /* rt group number shift */ + __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ + + /* + * The size of this structure must be padded to 64 bit alignment. + * + * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when + * adding new fields here. + */ }; +#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) + /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. @@ -279,7 +300,7 @@ struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -288,12 +309,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. */ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp) { return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) || (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); @@ -343,8 +364,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) #define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL static inline bool xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -361,31 +382,42 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ -#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ -#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ -#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ -#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ +#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ +#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ +#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID| \ - XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ - XFS_SB_FEAT_INCOMPAT_NREXT64) + (XFS_SB_FEAT_INCOMPAT_FTYPE | \ + XFS_SB_FEAT_INCOMPAT_SPINODES | \ + XFS_SB_FEAT_INCOMPAT_META_UUID | \ + XFS_SB_FEAT_INCOMPAT_BIGTIME | \ + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ + XFS_SB_FEAT_INCOMPAT_NREXT64 | \ + XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ + XFS_SB_FEAT_INCOMPAT_PARENT | \ + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -396,8 +428,8 @@ xfs_sb_has_incompat_feature( #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -417,7 +449,7 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } -static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -691,21 +723,58 @@ struct xfs_agfl { /* * Realtime bitmap information is accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_rtword_raw { __u32 old; + __be32 rtg; }; /* * Realtime summary counts are accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_suminfo_raw { __u32 old; + __be32 rtg; }; /* + * Realtime allocation groups break the rt section into multiple pieces that + * could be locked independently. Realtime block group numbers are 32-bit + * quantities. Block numbers within a group are also 32-bit quantities, but + * the upper bit must never be set. rtgroup 0 might have a superblock in it, + * so the minimum size of an rtgroup is 2 rtx. + */ +#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1) +#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2) +#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U)) + +#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */ + +/* + * Realtime superblock - on disk version. Must be padded to 64 bit alignment. + * The first block of the realtime volume contains this superblock. + */ +struct xfs_rtsb { + __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */ + __le32 rsb_crc; /* superblock crc */ + + __be32 rsb_pad; /* zero */ + unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */ + + uuid_t rsb_uuid; /* user-visible file system unique id */ + uuid_t rsb_meta_uuid; /* metadata file system unique id */ + + /* must be padded to 64 bit alignment */ +}; + +#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc) +#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */ + +/* * XFS Timestamps * ============== * @@ -787,6 +856,31 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; } +enum xfs_metafile_type { + XFS_METAFILE_UNKNOWN, /* unknown */ + XFS_METAFILE_DIR, /* metadir directory */ + XFS_METAFILE_USRQUOTA, /* user quota */ + XFS_METAFILE_GRPQUOTA, /* group quota */ + XFS_METAFILE_PRJQUOTA, /* project quota */ + XFS_METAFILE_RTBITMAP, /* rt bitmap */ + XFS_METAFILE_RTSUMMARY, /* rt summary */ + XFS_METAFILE_RTRMAP, /* rt rmap */ + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ + + XFS_METAFILE_MAX +} __packed; + +#define XFS_METAFILE_TYPE_STR \ + { XFS_METAFILE_UNKNOWN, "unknown" }, \ + { XFS_METAFILE_DIR, "dir" }, \ + { XFS_METAFILE_USRQUOTA, "usrquota" }, \ + { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ + { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ + { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ + { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } + /* * On-disk inode structure. * @@ -809,7 +903,7 @@ struct xfs_dinode { __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ + __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */ __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ @@ -865,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ @@ -898,6 +997,12 @@ static inline uint xfs_dinode_size(int version) #define XFS_MAXLINK ((1U << 31) - 1U) /* + * Any file that hits the maximum ondisk link count should be pinned to avoid + * a use-after-free situation. + */ +#define XFS_NLINK_PINNED (~0U) + +/* * Values for di_format * * This enum is used in string mapping in xfs_trace.h; please keep the @@ -908,7 +1013,8 @@ enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* added long ago, but never used */ + XFS_DINODE_FMT_UUID, /* added long ago, but never used */ + XFS_DINODE_FMT_META_BTREE, /* metadata btree */ }; #define XFS_INODE_FORMAT_STR \ @@ -916,7 +1022,8 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_LOCAL, "local" }, \ { XFS_DINODE_FMT_EXTENTS, "extent" }, \ { XFS_DINODE_FMT_BTREE, "btree" }, \ - { XFS_DINODE_FMT_UUID, "uuid" } + { XFS_DINODE_FMT_UUID, "uuid" }, \ + { XFS_DINODE_FMT_META_BTREE, "meta_btree" } /* * Max values for extnum and aextnum. @@ -1079,21 +1186,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Values for di_flags2 These start by being exposed to userspace in the upper * 16 bits of the XFS_XFLAG_s range. */ -#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ -#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ -#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ -#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ -#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +/* use DAX for this inode */ +#define XFS_DIFLAG2_DAX_BIT 0 + +/* file's blocks may be shared */ +#define XFS_DIFLAG2_REFLINK_BIT 1 -#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) -#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) -#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) -#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +/* copy on write extent size hint */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 + +/* big timestamps */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 + +/* large extent counters */ +#define XFS_DIFLAG2_NREXT64_BIT 4 + +/* + * The inode contains filesystem metadata and can be found through the metadata + * directory tree. Metadata inodes must satisfy the following constraints: + * + * - V5 filesystem (and ftype) are enabled; + * - The only valid modes are regular files and directories; + * - The access bits must be zero; + * - DMAPI event and state masks are zero; + * - The user and group IDs must be zero; + * - The project ID can be used as a u32 annotation; + * - The immutable, sync, noatime, nodump, nodefrag flags must be set. + * - The dax flag must not be set. + * - Directories must have nosymlinks set. + * + * These requirements are chosen defensively to minimize the ability of + * userspace to read or modify the contents, should a metadata file ever + * escape to userspace. + * + * There are further constraints on the directory tree itself: + * + * - Metadata inodes must never be resolvable through the root directory; + * - They must never be accessed by userspace; + * - Metadata directory entries must have correct ftype. + * + * Superblock-rooted metadata files must have the METADATA iflag set even + * though they do not have a parent directory. + */ +#define XFS_DIFLAG2_METADATA_BIT 5 + +#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1108,6 +1254,12 @@ static inline bool xfs_dinode_has_large_extent_counts( (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); } +static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1156,6 +1308,24 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* + * RT bit manipulation macros. + */ +#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */ +#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */ + +struct xfs_rtbuf_blkinfo { + __be32 rt_magic; /* validity check on block */ + __be32 rt_crc; /* CRC of block */ + __be64 rt_owner; /* inode that owns the block */ + __be64 rt_blkno; /* first block of the buffer */ + __be64 rt_lsn; /* sequence number of last write */ + uuid_t rt_uuid; /* filesystem we belong to */ +}; + +#define XFS_RTBUF_CRC_OFF \ + offsetof(struct xfs_rtbuf_blkinfo, rt_crc) + +/* * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ @@ -1574,6 +1744,24 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Realtime Reverse mapping btree format definitions + * + * This is a btree for reverse mapping records for realtime volumes + */ +#define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ + +/* + * rtrmap root header, on-disk form only. + */ +struct xfs_rtrmap_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-based btree pointer type */ +typedef __be64 xfs_rtrmap_ptr_t; + +/* * Reference Count Btree format definitions * */ @@ -1616,12 +1804,29 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -#define MAXREFCOUNT ((xfs_nlink_t)~0U) -#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) +#define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) +#define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) /* btree pointer type */ typedef __be32 xfs_refcount_ptr_t; +/* + * Realtime Reference Count btree format definitions + * + * This is a btree for reference count records for realtime volumes + */ +#define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ + +/* + * rt refcount root header, on-disk form only. + */ +struct xfs_rtrefcount_root { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +}; + +/* inode-rooted btree pointer type */ +typedef __be64 xfs_rtrefcount_ptr_t; /* * BMAP Btree format definitions diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ca1b17d01437..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -8,6 +8,7 @@ /* * SGI's XFS filesystem's major stuff (constants, structures) + * NOTE: This file must be compile-able with C++ compilers. */ /* @@ -186,7 +187,11 @@ struct xfs_fsop_geom { __u32 logsunit; /* log stripe unit, bytes */ uint32_t sick; /* o: unhealthy fs & rt metadata */ uint32_t checked; /* o: checked fs & rt metadata */ - __u64 reserved[17]; /* reserved space */ + __u32 rgextents; /* rt extents in a realtime group */ + __u32 rgcount; /* number of realtime groups */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -197,6 +202,8 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ #define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ #define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ +#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */ +#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -239,6 +246,10 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ +#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ +#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. @@ -409,6 +420,7 @@ struct xfs_bulkstat { #define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ #define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ +#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */ /* * Project quota id helpers (previously projid was 16bit only @@ -485,9 +497,17 @@ struct xfs_bulk_ireq { */ #define XFS_BULK_IREQ_NREXT64 (1U << 2) +/* + * Allow bulkstat to return information about metadata directories. This + * enables xfs_scrub to find them for scanning, as they are otherwise ordinary + * directories. + */ +#define XFS_BULK_IREQ_METADIR (1U << 3) + #define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ XFS_BULK_IREQ_SPECIAL | \ - XFS_BULK_IREQ_NREXT64) + XFS_BULK_IREQ_NREXT64 | \ + XFS_BULK_IREQ_METADIR) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -632,7 +652,9 @@ typedef struct xfs_fsop_attrmulti_handlereq { /* * per machine unique filesystem identifier types. */ -typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ +typedef struct xfs_fsid { + __u32 val[2]; /* file system id type */ +} xfs_fsid_t; typedef struct xfs_fid { __u16 fid_len; /* length of remainder */ @@ -715,9 +737,23 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ #define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ +#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ +#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ +#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ +#define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ +#define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 28 +#define XFS_SCRUB_TYPE_NR 33 + +/* + * This special type code only applies to the vectored scrub implementation. + * + * If any of the previous scrub vectors recorded runtime errors or have + * sv_flags bits set that match the OFLAG bits in the barrier vector's + * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace. + */ +#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF) /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) @@ -763,6 +799,47 @@ struct xfs_scrub_metadata { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) +/* Vectored scrub calls to reduce the number of kernel transitions. */ + +struct xfs_scrub_vec { + __u32 sv_type; /* XFS_SCRUB_TYPE_* */ + __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */ + __s32 sv_ret; /* 0 or a negative error code */ + __u32 sv_reserved; /* must be zero */ +}; + +/* Vectored metadata scrub control structure. */ +struct xfs_scrub_vec_head { + __u64 svh_ino; /* inode number. */ + __u32 svh_gen; /* inode generation. */ + __u32 svh_agno; /* ag number. */ + __u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */ + __u16 svh_rest_us; /* wait this much time between vector items */ + __u16 svh_nr; /* number of svh_vectors */ + __u64 svh_reserved; /* must be zero */ + __u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */ +}; + +#define XFS_SCRUB_VEC_FLAGS_ALL (0) + +/* + * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for + * path checking. + */ +#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? */ +#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtrgroups metadir */ +#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */ +#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */ +#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */ +#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ +#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ +#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ +#define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ +#define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ + +/* Number of metapath sm_ino values */ +#define XFS_SCRUB_METAPATH_NR (10) + /* * ioctl limits */ @@ -772,6 +849,159 @@ struct xfs_scrub_metadata { # define XFS_XATTR_LIST_MAX 65536 #endif +/* + * Exchange part of file1 with part of the file that this ioctl that is being + * called against (which we'll call file2). Filesystems must be able to + * restart and complete the operation even after the system goes down. + */ +struct xfs_exchange_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ +}; + +/* + * Using the same definition of file2 as struct xfs_exchange_range, commit the + * contents of file1 into file2 if file2 has the same inode number, mtime, and + * ctime as the arguments provided to the call. The old contents of file2 will + * be moved to file1. + * + * Returns -EBUSY if there isn't an exact match for the file2 fields. + * + * Filesystems must be able to restart and complete the operation even after + * the system goes down. + */ +struct xfs_commit_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ + + /* opaque file2 metadata for freshness checks */ + __u64 file2_freshness[6]; +}; + +/* + * Exchange file data all the way to the ends of both files, and then exchange + * the file sizes. This flag can be used to replace a file's contents with a + * different amount of data. length will be ignored. + */ +#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0) + +/* Flush all changes in file data and file metadata to disk before returning. */ +#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1) + +/* Dry run; do all the parameter verification but do not change anything. */ +#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2) + +/* + * Exchange only the parts of the two files where the file allocation units + * mapped to file1's range have been written to. This can accelerate + * scatter-gather atomic writes with a temp file if all writes are aligned to + * the file allocation unit. + */ +#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3) + +#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF | \ + XFS_EXCHANGE_RANGE_DSYNC | \ + XFS_EXCHANGE_RANGE_DRY_RUN | \ + XFS_EXCHANGE_RANGE_FILE1_WRITTEN) + +/* Iterating parent pointers of files. */ + +/* target was the root directory */ +#define XFS_GETPARENTS_OFLAG_ROOT (1U << 0) + +/* Cursor is done iterating pptrs */ +#define XFS_GETPARENTS_OFLAG_DONE (1U << 1) + +#define XFS_GETPARENTS_OFLAGS_ALL (XFS_GETPARENTS_OFLAG_ROOT | \ + XFS_GETPARENTS_OFLAG_DONE) + +#define XFS_GETPARENTS_IFLAGS_ALL (0) + +struct xfs_getparents_rec { + struct xfs_handle gpr_parent; /* Handle to parent */ + __u32 gpr_reclen; /* Length of entire record */ + __u32 gpr_reserved; /* zero */ + char gpr_name[]; /* Null-terminated filename */ +}; + +/* Iterate through this file's directory parent pointers */ +struct xfs_getparents { + /* + * Structure to track progress in iterating the parent pointers. + * Must be initialized to zeroes before the first ioctl call, and + * not touched by callers after that. + */ + struct xfs_attrlist_cursor gp_cursor; + + /* Input flags: XFS_GETPARENTS_IFLAG* */ + __u16 gp_iflags; + + /* Output flags: XFS_GETPARENTS_OFLAG* */ + __u16 gp_oflags; + + /* Size of the gp_buffer in bytes */ + __u32 gp_bufsize; + + /* Must be set to zero */ + __u64 gp_reserved; + + /* Pointer to a buffer in which to place xfs_getparents_rec */ + __u64 gp_buffer; +}; + +static inline struct xfs_getparents_rec * +xfs_getparents_first_rec(struct xfs_getparents *gp) +{ + return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer; +} + +static inline struct xfs_getparents_rec * +xfs_getparents_next_rec(struct xfs_getparents *gp, + struct xfs_getparents_rec *gpr) +{ + void *next = ((char *)gpr + gpr->gpr_reclen); + void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize); + + if (next >= end) + return NULL; + + return (struct xfs_getparents_rec *)next; +} + +/* Iterate through this file handle's directory parent pointers. */ +struct xfs_getparents_by_handle { + /* Handle to file whose parents we want. */ + struct xfs_handle gph_handle; + + struct xfs_getparents gph_request; +}; + +/* + * Output for XFS_IOC_RTGROUP_GEOMETRY + */ +struct xfs_rtgroup_geometry { + __u32 rg_number; /* i/o: rtgroup number */ + __u32 rg_length; /* o: length in blocks */ + __u32 rg_sick; /* o: sick things in ag */ + __u32 rg_checked; /* o: checked metadata in ag */ + __u32 rg_flags; /* i/o: flags for this ag */ + __u32 rg_reserved[27]; /* o: zero */ +}; +#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ +#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ +#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ +#define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ +#define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ /* * ioctl commands that are used by Linux filesystems @@ -808,6 +1038,10 @@ struct xfs_scrub_metadata { /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) #define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) +#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) +#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) +#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) +#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) /* * ioctl commands that replace IRIX syssgi()'s @@ -843,8 +1077,20 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) +#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) +#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range) +#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c new file mode 100644 index 000000000000..e9d76bcdc820 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_extent_busy.h" +#include "xfs_group.h" + +/* + * Groups can have passive and active references. + * + * For passive references the code freeing a group is responsible for cleaning + * up objects that hold the passive references (e.g. cached buffers). + * Routines manipulating passive references are xfs_group_get, xfs_group_hold + * and xfs_group_put. + * + * Active references are for short term access to the group for walking trees or + * accessing state. If a group is being shrunk or offlined, the lookup will fail + * to find that group and return NULL instead. + * Routines manipulating active references are xfs_group_grab and + * xfs_group_rele. + */ + +struct xfs_group * +xfs_group_get( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_get(xg, _RET_IP_); + ASSERT(atomic_read(&xg->xg_ref) >= 0); + atomic_inc(&xg->xg_ref); + } + rcu_read_unlock(); + return xg; +} + +struct xfs_group * +xfs_group_hold( + struct xfs_group *xg) +{ + ASSERT(atomic_read(&xg->xg_ref) > 0 || + atomic_read(&xg->xg_active_ref) > 0); + + trace_xfs_group_hold(xg, _RET_IP_); + atomic_inc(&xg->xg_ref); + return xg; +} + +void +xfs_group_put( + struct xfs_group *xg) +{ + trace_xfs_group_put(xg, _RET_IP_); + + ASSERT(atomic_read(&xg->xg_ref) > 0); + atomic_dec(&xg->xg_ref); +} + +struct xfs_group * +xfs_group_grab( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_grab(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +/* + * Iterate to the next group. To start the iteration at @start_index, a %NULL + * @xg is passed, else the previous group returned from this function. The + * caller should break out of the loop when this returns %NULL. If the caller + * wants to break out of a loop that did not finish it needs to release the + * active reference to @xg using xfs_group_rele() itself. + */ +struct xfs_group * +xfs_group_next_range( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t start_index, + uint32_t end_index, + enum xfs_group_type type) +{ + uint32_t index = start_index; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + if (index > end_index) + return NULL; + return xfs_group_grab(mp, index, type); +} + +/* + * Find the next group after @xg, or the first group if @xg is NULL. + */ +struct xfs_group * +xfs_group_grab_next_mark( + struct xfs_mount *mp, + struct xfs_group *xg, + xa_mark_t mark, + enum xfs_group_type type) +{ + unsigned long index = 0; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + + rcu_read_lock(); + xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark); + if (xg) { + trace_xfs_group_grab_next_tag(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +void +xfs_group_rele( + struct xfs_group *xg) +{ + trace_xfs_group_rele(xg, _RET_IP_); + atomic_dec(&xg->xg_active_ref); +} + +void +xfs_group_free( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type, + void (*uninit)(struct xfs_group *xg)) +{ + struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index); + + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0); + + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + + if (uninit) + uninit(xg); + + /* drop the mount's active reference */ + xfs_group_rele(xg); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + kfree_rcu_mightsleep(xg); +} + +int +xfs_group_insert( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t index, + enum xfs_group_type type) +{ + int error; + + xg->xg_mount = mp; + xg->xg_gno = index; + xg->xg_type = type; + +#ifdef __KERNEL__ + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + spin_lock_init(&xg->xg_state_lock); + xfs_hooks_init(&xg->xg_rmap_update_hooks); +#endif + xfs_defer_drain_init(&xg->xg_intents_drain); + + /* Active ref owned by mount indicates group is online. */ + atomic_set(&xg->xg_active_ref, 1); + + error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL); + if (error) { + WARN_ON_ONCE(error == -EBUSY); + goto out_drain; + } + + return 0; +out_drain: + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + return error; +} + +struct xfs_group * +xfs_group_get_by_fsb( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type); +} diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h new file mode 100644 index 000000000000..4423932a2313 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + */ +#ifndef __LIBXFS_GROUP_H +#define __LIBXFS_GROUP_H 1 + +struct xfs_group { + struct xfs_mount *xg_mount; + uint32_t xg_gno; + enum xfs_group_type xg_type; + atomic_t xg_ref; /* passive reference count */ + atomic_t xg_active_ref; /* active reference count */ + + /* Precalculated geometry info */ + uint32_t xg_block_count; /* max usable gbno */ + uint32_t xg_min_gbno; /* min usable gbno */ + +#ifdef __KERNEL__ + /* -- kernel only structures below this line -- */ + + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold xg_state_lock before accessing this field. + */ + uint16_t xg_checked; + uint16_t xg_sick; + spinlock_t xg_state_lock; + + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain xg_intents_drain; + + /* + * Hook to feed rmapbt updates to an active online repair. + */ + struct xfs_hooks xg_rmap_update_hooks; +#endif /* __KERNEL__ */ +}; + +struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +struct xfs_group *xfs_group_hold(struct xfs_group *xg); +void xfs_group_put(struct xfs_group *xg); + +struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_next_range(struct xfs_mount *mp, + struct xfs_group *xg, uint32_t start_index, uint32_t end_index, + enum xfs_group_type type); +struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp, + struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type); +void xfs_group_rele(struct xfs_group *xg); + +void xfs_group_free(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type, void (*uninit)(struct xfs_group *xg)); +int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg, + uint32_t index, enum xfs_group_type); + +#define xfs_group_set_mark(_xg, _mark) \ + xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_clear_mark(_xg, _mark) \ + xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_marked(_mp, _type, _mark) \ + xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark)) + +static inline xfs_agblock_t +xfs_group_max_blocks( + struct xfs_group *xg) +{ + return xg->xg_mount->m_groups[xg->xg_type].blocks; +} + +static inline xfs_fsblock_t +xfs_group_start_fsb( + struct xfs_group *xg) +{ + return ((xfs_fsblock_t)xg->xg_gno) << + xg->xg_mount->m_groups[xg->xg_type].blklog; +} + +static inline xfs_fsblock_t +xfs_gbno_to_fsb( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + return xfs_group_start_fsb(xg) | gbno; +} + +static inline xfs_daddr_t +xfs_gbno_to_daddr( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + struct xfs_mount *mp = xg->xg_mount; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; + + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); +} + +static inline uint32_t +xfs_fsb_to_gno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + if (!mp->m_groups[type].blklog) + return 0; + return fsbno >> mp->m_groups[type].blklog; +} + +static inline xfs_agblock_t +xfs_fsb_to_gbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return fsbno & mp->m_groups[type].blkmask; +} + +static inline bool +xfs_verify_gbno( + struct xfs_group *xg, + uint32_t gbno) +{ + if (gbno >= xg->xg_block_count) + return false; + if (gbno < xg->xg_min_gbno) + return false; + return true; +} + +static inline bool +xfs_verify_gbext( + struct xfs_group *xg, + uint32_t gbno, + uint32_t glen) +{ + uint32_t end; + + if (!xfs_verify_gbno(xg, gbno)) + return false; + if (glen == 0 || check_add_overflow(gbno, glen - 1, &end)) + return false; + if (!xfs_verify_gbno(xg, end)) + return false; + return true; +} + +#endif /* __LIBXFS_GROUP_H */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 3c64b5f9bd68..b31000f7190c 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -6,6 +6,8 @@ #ifndef __XFS_HEALTH_H__ #define __XFS_HEALTH_H__ +struct xfs_group; + /* * In-Core Filesystem Health Assessments * ===================================== @@ -52,6 +54,7 @@ struct xfs_inode; struct xfs_fsop_geom; struct xfs_btree_cur; struct xfs_da_args; +struct xfs_rtgroup; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ @@ -60,10 +63,15 @@ struct xfs_da_args; #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ #define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ #define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ +#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */ +#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */ -/* Observable health issues for realtime volume metadata. */ -#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ -#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ +/* Observable health issues for realtime group metadata. */ +#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ +#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ +#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ +#define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ +#define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ /* Observable health issues for AG metadata. */ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -95,6 +103,7 @@ struct xfs_da_args; /* Don't propagate sick status to ag health summary during inactivation */ #define XFS_SICK_INO_FORGET (1 << 12) +#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */ /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ @@ -102,10 +111,15 @@ struct xfs_da_args; XFS_SICK_FS_GQUOTA | \ XFS_SICK_FS_PQUOTA | \ XFS_SICK_FS_QUOTACHECK | \ - XFS_SICK_FS_NLINKS) + XFS_SICK_FS_NLINKS | \ + XFS_SICK_FS_METADIR | \ + XFS_SICK_FS_METAPATH) -#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ - XFS_SICK_RT_SUMMARY) +#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ + XFS_SICK_RG_BITMAP | \ + XFS_SICK_RG_SUMMARY | \ + XFS_SICK_RG_RMAPBT | \ + XFS_SICK_RG_REFCNTBT) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ @@ -125,7 +139,8 @@ struct xfs_da_args; XFS_SICK_INO_DIR | \ XFS_SICK_INO_XATTR | \ XFS_SICK_INO_SYMLINK | \ - XFS_SICK_INO_PARENT) + XFS_SICK_INO_PARENT | \ + XFS_SICK_INO_DIRTREE) #define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ XFS_SICK_INO_BMBTA_ZAPPED | \ @@ -134,26 +149,26 @@ struct xfs_da_args; /* Secondary state related to (but not primary evidence of) health problems. */ #define XFS_SICK_FS_SECONDARY (0) -#define XFS_SICK_RT_SECONDARY (0) +#define XFS_SICK_RG_SECONDARY (0) #define XFS_SICK_AG_SECONDARY (0) #define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) /* Evidence of health problems elsewhere. */ #define XFS_SICK_FS_INDIRECT (0) -#define XFS_SICK_RT_INDIRECT (0) +#define XFS_SICK_RG_INDIRECT (0) #define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) #define XFS_SICK_INO_INDIRECT (0) /* All health masks. */ -#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ XFS_SICK_FS_SECONDARY | \ XFS_SICK_FS_INDIRECT) -#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \ - XFS_SICK_RT_SECONDARY | \ - XFS_SICK_RT_INDIRECT) +#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \ + XFS_SICK_RG_SECONDARY | \ + XFS_SICK_RG_INDIRECT) -#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ XFS_SICK_AG_SECONDARY | \ XFS_SICK_AG_INDIRECT) @@ -187,18 +202,17 @@ void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); -void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, - unsigned int *checked); +void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno, + unsigned int mask); void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int mask); -void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, +void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask); +#define xfs_ag_mark_sick(pag, mask) \ + xfs_group_mark_sick(pag_group(pag), (mask)) +void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask); +void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask); +void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); @@ -225,22 +239,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) } static inline bool -xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +xfs_group_has_sickness( + struct xfs_group *xg, + unsigned int mask) { - unsigned int sick, checked; + unsigned int sick, checked; - xfs_rt_measure_sickness(mp, &sick, &checked); + xfs_group_measure_sickness(xg, &sick, &checked); return sick & mask; } -static inline bool -xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) -{ - unsigned int sick, checked; +#define xfs_ag_has_sickness(pag, mask) \ + xfs_group_has_sickness(pag_group(pag), (mask)) +#define xfs_ag_is_healthy(pag) \ + (!xfs_ag_has_sickness((pag), UINT_MAX)) - xfs_ag_measure_sickness(pag, &sick, &checked); - return sick & mask; -} +#define xfs_rtgroup_has_sickness(rtg, mask) \ + xfs_group_has_sickness(rtg_group(rtg), (mask)) +#define xfs_rtgroup_is_healthy(rtg) \ + (!xfs_rtgroup_has_sickness((rtg), UINT_MAX)) static inline bool xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) @@ -258,18 +275,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp) } static inline bool -xfs_rt_is_healthy(struct xfs_mount *mp) -{ - return !xfs_rt_has_sickness(mp, -1U); -} - -static inline bool -xfs_ag_is_healthy(struct xfs_perag *pag) -{ - return !xfs_ag_has_sickness(pag, -1U); -} - -static inline bool xfs_inode_is_healthy(struct xfs_inode *ip) { return !xfs_inode_has_sickness(ip, -1U); @@ -277,6 +282,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip) void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); #define xfs_metadata_is_sick(error) \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e5ac3e5430c4..750111634d9f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -142,7 +142,7 @@ xfs_inobt_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -170,7 +170,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -275,8 +275,10 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!xfs_is_shutdown(cur->bc_mp)) - ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); + if (!xfs_is_shutdown(cur->bc_mp)) { + ASSERT(freecount == + to_perag(cur->bc_group)->pagi_freecount); + } } return 0; } @@ -362,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -551,7 +553,7 @@ xfs_inobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -606,15 +608,12 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(pag, &rec, nrec); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(pag, nrec); error = xfs_inobt_rec_check_count(mp, nrec); if (error) @@ -648,7 +647,7 @@ xfs_finobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -768,8 +767,7 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - args.agbno)); + xfs_agbno_to_fsb(pag, args.agbno)); if (error) return error; @@ -811,8 +809,8 @@ xfs_ialloc_ag_alloc( */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -824,8 +822,8 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -855,13 +853,14 @@ sparse_alloc: * the end of the AG. */ args.min_agbno = args.mp->m_sb.sb_inoalignmt; - args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.max_agbno = round_down(xfs_ag_block_count(args.mp, + pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -884,7 +883,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) @@ -915,8 +914,7 @@ sparse_alloc: if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, pag->pag_agno, - rec.ir_startino), + xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1058,6 +1056,33 @@ xfs_inobt_first_free_inode( } /* + * If this AG has corrupt inodes, check if allocating this inode would fail + * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again + * somewhere else. + */ +static int +xfs_dialloc_check_ino( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_ino_t ino) +{ + struct xfs_imap imap; + struct xfs_buf *bp; + int error; + + error = xfs_imap(pag, tp, ino, &imap, 0); + if (error) + return -EAGAIN; + + error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); + if (error) + return -EAGAIN; + + xfs_trans_brelse(tp, bp); + return 0; +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -1100,7 +1125,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == pag->pag_agno) { + if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1308,7 +1333,14 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error0; + } + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1570,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (pag->pag_agno == pagno) + if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1582,7 +1614,13 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error_cur; + } /* * Modify or remove the finobt record. @@ -1699,7 +1737,7 @@ xfs_dialloc_good_ag( return false; if (!xfs_perag_initialised_agi(pag)) { - error = xfs_ialloc_read_agi(pag, tp, NULL); + error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } @@ -1768,7 +1806,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; @@ -1805,6 +1843,40 @@ out_release: } /* + * Pick an AG for the new inode. + * + * Directories, symlinks, and regular files frequently allocate at least one + * block, so factor that potential expansion when we examine whether an AG has + * enough space for file creation. Try to keep metadata files all in the same + * AG. + */ +static inline xfs_agnumber_t +xfs_dialloc_pick_ag( + struct xfs_mount *mp, + struct xfs_inode *dp, + umode_t mode) +{ + xfs_agnumber_t start_agno; + + if (!dp) + return 0; + if (xfs_is_metadir_inode(dp)) { + if (mp->m_sb.sb_logstart) + return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); + return 0; + } + + if (S_ISDIR(mode)) + return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; + + start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + + return start_agno; +} + +/* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to @@ -1815,34 +1887,23 @@ out_release: int xfs_dialloc( struct xfs_trans **tpp, - xfs_ino_t parent, - umode_t mode, + const struct xfs_icreate_args *args, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; - xfs_agnumber_t agno; - int error = 0; - xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_ino_t ino = NULLFSINO; + xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; + xfs_agnumber_t agno; + xfs_agnumber_t start_agno; + umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; bool low_space = false; int flags; - xfs_ino_t ino = NULLFSINO; + int error = 0; - /* - * Directories, symlinks, and regular files frequently allocate at least - * one block, so factor that potential expansion when we examine whether - * an AG has enough space for file creation. - */ - if (S_ISDIR(mode)) - start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % - mp->m_maxagi; - else { - start_agno = XFS_INO_TO_AGNO(mp, parent); - if (start_agno >= mp->m_maxagi) - start_agno = 0; - } + start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear @@ -1866,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; @@ -1906,6 +1967,21 @@ retry: } return -ENOSPC; } + + /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if (ino == parent || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); + return -EFSCORRUPTED; + } + *new_ino = ino; return 0; } @@ -1918,7 +1994,7 @@ retry: static int xfs_difree_inode_chunk( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; @@ -1932,10 +2008,9 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - return xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, sagbno), + return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, 0); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1979,10 +2054,9 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, - false); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), + contigblk, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -2004,7 +2078,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2069,8 +2143,7 @@ xfs_difree_inobt( if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - rec.ir_startino); + xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2093,7 +2166,7 @@ xfs_difree_inobt( goto error0; } - error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag, &rec); if (error) goto error0; } else { @@ -2139,7 +2212,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; @@ -2262,31 +2335,31 @@ xfs_difree( /* * Break up inode number into its components. */ - if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { - xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", - __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); + if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + if (inode != xfs_agino_to_ino(pag, agino)) { + xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { + xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", + __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2325,18 +2398,18 @@ xfs_imap_lookup( xfs_agblock_t *offset_agbno, int flags) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, pag->pag_agno); + __func__, error, pag_agno(pag)); return error; } @@ -2386,7 +2459,7 @@ xfs_imap( struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2402,8 +2475,8 @@ xfs_imap( */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || + ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* @@ -2412,17 +2485,18 @@ xfs_imap( */ if (flags & XFS_IGET_UNTRUSTED) return error; - if (agbno >= mp->m_sb.sb_agblocks) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, - (unsigned long)mp->m_sb.sb_agblocks); + (unsigned long)xfs_ag_block_count(mp, + pag_agno(pag))); } - if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, - "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ @@ -2452,7 +2526,7 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2482,7 +2556,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2675,16 +2749,17 @@ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + xfs_buf_flags_t flags, struct xfs_buf **agibpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) @@ -2704,15 +2779,18 @@ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_ialloc_read_agi(pag); - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, + (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, + &agibp); if (error) return error; @@ -2723,12 +2801,35 @@ xfs_ialloc_read_agi( set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } +#ifdef DEBUG /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag->pag_mount)); + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount); + ok &= pag->pagi_count == be32_to_cpu(agi->agi_count); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + xfs_trans_brelse(tp, agibp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + } +#endif /* DEBUG */ + if (agibpp) *agibpp = agibp; else @@ -2828,7 +2929,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); @@ -2889,8 +2990,8 @@ xfs_ialloc_setup_geometry( /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true); + igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false); igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; @@ -2975,6 +3076,11 @@ xfs_ialloc_setup_geometry( igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; + + if (mp->m_sb.sb_blocksize > PAGE_SIZE) + igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; + else + igeo->min_folio_order = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ @@ -3062,13 +3168,13 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_has_sparseinodes(pag->pag_mount)) + if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. */ - agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); + agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index f1412183bb44..3a1323155a45 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -33,11 +33,13 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } +struct xfs_icreate_args; + /* * Allocate an inode on disk. Mode is used to tell whether the new inode will * need space, and whether it is a directory. */ -int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, +int xfs_dialloc(struct xfs_trans **tpp, const struct xfs_icreate_args *args, xfs_ino_t *new_ino); int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, @@ -63,10 +65,11 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags, struct xfs_buf **agibpp); int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf **agibpp); + int flags, struct xfs_buf **agibpp); +#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index cc661fca6ff5..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -37,7 +37,7 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -45,7 +45,7 @@ STATIC struct xfs_btree_cur * xfs_finobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -112,7 +112,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_INOBT; args.minlen = 1; args.maxlen = 1; @@ -120,7 +120,7 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); + xfs_agbno_to_fsb(args.pag, sbno)); if (error) return error; @@ -170,7 +170,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv, false); + &XFS_RMAP_OINFO_INOBT, resv, 0); } STATIC int @@ -248,7 +248,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -260,7 +260,8 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; } @@ -478,12 +479,12 @@ xfs_inobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -504,12 +505,12 @@ xfs_finobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -572,11 +573,11 @@ xfs_inobt_block_maxrecs( /* * Calculate number of records in an inobt btree block. */ -int +unsigned int xfs_inobt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { blocklen -= XFS_INOBT_BLOCK_LEN(mp); return xfs_inobt_block_maxrecs(blocklen, leaf); @@ -715,8 +716,8 @@ static xfs_extlen_t xfs_inobt_max_size( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; - xfs_agblock_t agblocks = pag->block_count; + struct xfs_mount *mp = pag_mount(pag); + xfs_agblock_t agblocks = pag_group(pag)->xg_block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -727,7 +728,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -743,16 +744,18 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + cur = xfs_finobt_init_cursor(pag, tp, agbp); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; return error; } @@ -768,7 +771,7 @@ xfs_finobt_read_blocks( struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; @@ -791,10 +794,10 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(pag->pag_mount)) + if (!xfs_has_finobt(pag_mount(pag))) return 0; - if (xfs_has_inobtcounts(pag->pag_mount)) + if (xfs_has_inobtcounts(pag_mount(pag))) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_finobt_count_blocks(pag, tp, &tree_len); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 6472ec1ecbb4..300edf5bc009 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -50,7 +50,8 @@ struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp); struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp); -extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); /* ir_holemask to inode allocation bitmap conversion */ uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec); diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d0dcce462bf4..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -19,6 +19,7 @@ #include "xfs_ialloc.h" #include "xfs_dir2.h" #include "xfs_health.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -136,7 +137,7 @@ xfs_imap_to_bp( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); @@ -209,12 +210,15 @@ xfs_inode_from_disk( * They will also be unconditionally written back to disk as v2 inodes. */ if (unlikely(from->di_version == 1)) { - set_nlink(inode, be16_to_cpu(from->di_onlink)); + /* di_metatype used to be di_onlink */ + set_nlink(inode, be16_to_cpu(from->di_metatype)); ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); + if (xfs_dinode_is_metadir(from)) + ip->i_metatype = be16_to_cpu(from->di_metatype); } i_uid_write(inode, be32_to_cpu(from->di_uid)); @@ -248,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -315,7 +322,10 @@ xfs_inode_to_disk( struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - to->di_onlink = 0; + if (xfs_is_metadir_inode(ip)) + to->di_metatype = cpu_to_be16(ip->i_metatype); + else + to->di_metatype = 0; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); @@ -342,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -374,17 +385,40 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. Directories + * being truncated have zero size and are not subject to this + * check. + */ + if (S_ISDIR(mode)) { + if (dip->di_size && + be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; @@ -411,6 +445,30 @@ xfs_dinode_verify_fork( if (di_nextents > max_extents) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp)) + return __this_address; + if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) + return __this_address; + switch (be16_to_cpu(dip->di_metatype)) { + case XFS_METAFILE_RTRMAP: + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use + * the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(mp)) + return __this_address; + break; + case XFS_METAFILE_RTREFCOUNT: + /* same comment about growfs and rmap inodes applies */ + if (!xfs_has_reflink(mp)) + return __this_address; + break; + default: + return __this_address; + } + break; default: return __this_address; } @@ -430,6 +488,10 @@ xfs_dinode_verify_forkoff( if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) return __this_address; break; + case XFS_DINODE_FMT_META_BTREE: + if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) + return __this_address; + fallthrough; case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: @@ -460,6 +522,69 @@ xfs_dinode_verify_nrext64( return NULL; } +/* + * Validate all the picky requirements we have for a file that claims to be + * filesystem metadata. + */ +xfs_failaddr_t +xfs_dinode_verify_metadir( + struct xfs_mount *mp, + struct xfs_dinode *dip, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + if (!xfs_has_metadir(mp)) + return __this_address; + + /* V5 filesystem only */ + if (dip->di_version < 3) + return __this_address; + + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + return __this_address; + + /* V3 inode fields that are always zero */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) + return __this_address; + if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) + return __this_address; + + /* Metadata files can only be directories or regular files */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* They must have zero access permissions */ + if (mode & 0777) + return __this_address; + + /* DMAPI event and state masks are zero */ + if (dip->di_dmevmask || dip->di_dmstate) + return __this_address; + + /* + * User and group IDs must be zero. The project ID is used for + * grouping inodes. Metadata inodes are never accounted to quotas. + */ + if (dip->di_uid || dip->di_gid) + return __this_address; + + /* Mandatory inode flags must be set */ + if (S_ISDIR(mode)) { + if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) + return __this_address; + } else { + if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) + return __this_address; + } + + /* dax flags2 must not be set */ + if (flags2 & XFS_DIFLAG2_DAX) + return __this_address; + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -491,6 +616,23 @@ xfs_dinode_verify( return __this_address; } + /* + * Historical note: xfsprogs in the 3.2 era set up its incore inodes to + * have di_nlink track the link count, even if the actual filesystem + * only supported V1 inodes (i.e. di_onlink). When writing out the + * ondisk inode, it would set both the ondisk di_nlink and di_onlink to + * the the incore di_nlink value, which is why we cannot check for + * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with + * di_onlink==0, so we can check that. + */ + if (dip->di_version == 2) { + if (dip->di_metatype) + return __this_address; + } else if (dip->di_version >= 3) { + if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) + return __this_address; + } + /* don't allow invalid i_size */ di_size = be64_to_cpu(dip->di_size); if (di_size & (1ULL << 63)) @@ -500,9 +642,20 @@ xfs_dinode_verify( if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; - /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) - return __this_address; + /* + * No zero-length symlinks/dirs unless they're unlinked and hence being + * inactivated. + */ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { + if (dip->di_version > 1) { + if (dip->di_nlink) + return __this_address; + } else { + /* di_metatype used to be di_onlink */ + if (dip->di_metatype) + return __this_address; + } + } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) @@ -516,9 +669,6 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; - if (nextents + naextents == 0 && nblocks != 0) - return __this_address; - if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; @@ -602,20 +752,40 @@ xfs_dinode_verify( return __this_address; /* don't let reflink and realtime mix */ - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && + !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_METADATA) { + fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); + if (fa) + return fa; + } + + /* metadata inodes containing btrees always have zero extent count */ + if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + } + return NULL; } @@ -751,11 +921,29 @@ xfs_inode_validate_cowextsize( bool rt_flag; bool hint_flag; uint32_t cowextsize_bytes; + uint32_t blocksize_bytes; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + /* + * Similar to extent size hints, a directory can be configured to + * propagate realtime status and a CoW extent size hint to newly + * created files even if there is no realtime device, and the hints on + * disk can become misaligned if the sysadmin changes the rt extent + * size while adding the realtime device. + * + * Therefore, we can only enforce the rextsize alignment check against + * regular realtime files, and rely on callers to decide when alignment + * checks are appropriate, and fix things up as needed. + */ + + if (rt_flag) + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + else + blocksize_bytes = mp->m_sb.sb_blocksize; + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -769,16 +957,13 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (hint_flag && rt_flag) - return __this_address; - - if (cowextsize_bytes % mp->m_sb.sb_blocksize) + if (cowextsize_bytes % blocksize_bytes) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; - if (cowextsize > mp->m_sb.sb_agblocks / 2) + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) return __this_address; return NULL; diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 585ed5a110af..8d43d2641c73 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp, + struct xfs_dinode *dip, uint16_t mode, uint16_t flags, + uint64_t flags2); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, uint32_t extsize, uint16_t mode, uint16_t flags); xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7d660a973909..4f99b90add55 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -27,6 +27,8 @@ #include "xfs_errortag.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_ifork_cache; @@ -178,14 +180,14 @@ xfs_iformat_btree( struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; struct xfs_ifork *ifp; - /* REFERENCED */ + struct xfs_btree_block *broot; int nrecs; int size; int level; ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(mp, dfp); + size = xfs_bmap_broot_space(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); @@ -198,7 +200,7 @@ xfs_iformat_btree( */ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || - XFS_BMDR_SPACE_CALC(nrecs) > + xfs_bmdr_space_calc(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { @@ -211,16 +213,13 @@ xfs_iformat_btree( return -EFSCORRUPTED; } - ifp->if_broot_bytes = size; - ifp->if_broot = kmalloc(size, - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - ASSERT(ifp->if_broot != NULL); + broot = xfs_broot_alloc(ifp, size); /* * Copy and convert from the on-disk structure * to the in-memory structure. */ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); + broot, size); ifp->if_bytes = 0; ifp->if_data = NULL; @@ -270,6 +269,16 @@ xfs_iformat_data_fork( return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); case XFS_DINODE_FMT_BTREE: return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_META_BTREE: + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + return xfs_iformat_rtrmap(ip, dip); + case XFS_METAFILE_RTREFCOUNT: + return xfs_iformat_rtrefcount(ip, dip); + default: + break; + } + fallthrough; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); @@ -363,135 +372,68 @@ xfs_iformat_attr_fork( } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we are adding records, one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. + * Allocate the if_broot component of an inode fork so that it is @new_size + * bytes in size, using __GFP_NOLOCKDEP like all the other code that + * initializes a broot during inode load. Returns if_broot. */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) +struct xfs_btree_block * +xfs_broot_alloc( + struct xfs_ifork *ifp, + size_t new_size) { - struct xfs_mount *mp = ip->i_mount; - int cur_max; - struct xfs_ifork *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; + ASSERT(ifp->if_broot == NULL); - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } + ifp->if_broot = kmalloc(new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; +} - ifp = xfs_ifork_ptr(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmalloc(new_size, - GFP_KERNEL | __GFP_NOFAIL); - ifp->if_broot_bytes = (int)new_size; - return; - } +/* + * Reallocate the if_broot component of an inode fork so that it is @new_size + * bytes in size. Returns if_broot. + */ +struct xfs_btree_block * +xfs_broot_realloc( + struct xfs_ifork *ifp, + size_t new_size) +{ + /* No size change? No action needed. */ + if (new_size == ifp->if_broot_bytes) + return ifp->if_broot; - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_KERNEL | __GFP_NOFAIL); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); - return; + /* New size is zero, free it. */ + if (new_size == 0) { + ifp->if_broot_bytes = 0; + kfree(ifp->if_broot); + ifp->if_broot = NULL; + return NULL; } /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. + * Shrinking the iroot means we allocate a new smaller object and copy + * it. We don't trust krealloc not to nop on realloc-down. */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - XFS_BMBT_BLOCK_LEN(ip->i_mount)); - } else { - new_broot = NULL; + if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) { + struct xfs_btree_block *old_broot = ifp->if_broot; + + ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + memcpy(ifp->if_broot, old_broot, new_size); + kfree(old_broot); + return ifp->if_broot; } /* - * Only copy the records and pointers if there are any. + * Growing the iroot means we can krealloc. This may get us the same + * object. */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); - - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); - } - kfree(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - if (ifp->if_broot) - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - xfs_inode_fork_size(ip, whichfork)); - return; + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_KERNEL | __GFP_NOFAIL); + ifp->if_broot_bytes = new_size; + return ifp->if_broot; } - /* * This is called when the amount of space needed for if_data * is increased or decreased. The change in size is indicated by @@ -655,7 +597,7 @@ xfs_iflush_fork( if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, @@ -671,6 +613,25 @@ xfs_iflush_fork( } break; + case XFS_DINODE_FMT_META_BTREE: + ASSERT(whichfork == XFS_DATA_FORK); + + if (!(iip->ili_fields & brootflag[whichfork])) + break; + + switch (ip->i_metatype) { + case XFS_METAFILE_RTRMAP: + xfs_iflush_rtrmap(ip, dip); + break; + case XFS_METAFILE_RTREFCOUNT: + xfs_iflush_rtrefcount(ip, dip); + break; + default: + ASSERT(0); + break; + } + break; + default: ASSERT(0); break; @@ -765,53 +726,46 @@ xfs_ifork_verify_local_attr( return 0; } +/* + * Check if the inode fork supports adding nr_to_add more extents. + * + * If it doesn't but we can upgrade it to large extent counters, do the upgrade. + * If we can't upgrade or are already using big counters but still can't fit the + * additional extents, return -EFBIG. + */ int -xfs_iext_count_may_overflow( +xfs_iext_count_extend( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - int nr_to_add) + uint nr_to_add) { + struct xfs_mount *mp = ip->i_mount; + bool has_large = + xfs_inode_has_large_extent_counts(ip); struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - uint64_t max_exts; uint64_t nr_exts; + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + if (whichfork == XFS_COW_FORK) return 0; - max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), - whichfork); - - if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) - max_exts = 10; - + /* no point in upgrading if if_nextents overflows */ nr_exts = ifp->if_nextents + nr_to_add; - if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + if (nr_exts < ifp->if_nextents) return -EFBIG; - return 0; -} - -/* - * Upgrade this inode's extent counter fields to be able to handle a potential - * increase in the extent count by nr_to_add. Normally this is the same - * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. - */ -int -xfs_iext_count_upgrade( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint nr_to_add) -{ - ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); - - if (!xfs_has_large_extent_counts(ip->i_mount) || - xfs_inode_has_large_extent_counts(ip) || - XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + nr_exts > 10) return -EFBIG; - ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - + if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { + if (has_large || !xfs_has_large_extent_counts(mp)) + return -EFBIG; + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index bd53eb951b65..69ed0919d60b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -170,7 +170,11 @@ void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, void xfs_idestroy_fork(struct xfs_ifork *ifp); void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); -void xfs_iroot_realloc(struct xfs_inode *, int, int); +struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, + size_t new_size); +struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, + size_t new_size); + int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); @@ -256,10 +260,8 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip); int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); -int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, - int nr_to_add); -int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, - uint nr_to_add); +int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, uint nr_to_add); bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork); /* returns true if the fork has extents but they are not read in yet. */ diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c new file mode 100644 index 000000000000..48fe49a5f050 --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include <linux/iversion.h> +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_inode_util.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" +#include "xfs_inode_item.h" + +uint16_t +xfs_flags2diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + /* can't set PREALLOC this way, just preserve it */ + uint16_t di_flags = + (ip->i_diflags & XFS_DIFLAG_PREALLOC); + + if (xflags & FS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & FS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & FS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & FS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & FS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & FS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & FS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & FS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & FS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(VFS_I(ip)->i_mode)) { + if (xflags & FS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & FS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + return di_flags; +} + +uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME | + XFS_DIFLAG2_NREXT64)); + + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + + return di_flags2; +} + +uint32_t +xfs_ip2xflags( + struct xfs_inode *ip) +{ + uint32_t flags = 0; + + if (ip->i_diflags & XFS_DIFLAG_ANY) { + if (ip->i_diflags & XFS_DIFLAG_REALTIME) + flags |= FS_XFLAG_REALTIME; + if (ip->i_diflags & XFS_DIFLAG_PREALLOC) + flags |= FS_XFLAG_PREALLOC; + if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_XFLAG_IMMUTABLE; + if (ip->i_diflags & XFS_DIFLAG_APPEND) + flags |= FS_XFLAG_APPEND; + if (ip->i_diflags & XFS_DIFLAG_SYNC) + flags |= FS_XFLAG_SYNC; + if (ip->i_diflags & XFS_DIFLAG_NOATIME) + flags |= FS_XFLAG_NOATIME; + if (ip->i_diflags & XFS_DIFLAG_NODUMP) + flags |= FS_XFLAG_NODUMP; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) + flags |= FS_XFLAG_RTINHERIT; + if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT) + flags |= FS_XFLAG_PROJINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS) + flags |= FS_XFLAG_NOSYMLINKS; + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) + flags |= FS_XFLAG_EXTSIZE; + if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) + flags |= FS_XFLAG_EXTSZINHERIT; + if (ip->i_diflags & XFS_DIFLAG_NODEFRAG) + flags |= FS_XFLAG_NODEFRAG; + if (ip->i_diflags & XFS_DIFLAG_FILESTREAM) + flags |= FS_XFLAG_FILESTREAM; + } + + if (ip->i_diflags2 & XFS_DIFLAG2_ANY) { + if (ip->i_diflags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; + } + + if (xfs_inode_has_attr_fork(ip)) + flags |= FS_XFLAG_HASATTR; + return flags; +} + +prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT) + return dp->i_projid; + + /* Assign to the root project by default. */ + return 0; +} + +/* Propagate di_flags from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + xfs_failaddr_t failaddr; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_diflags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_extsize = pip->i_extsize; + } + if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && + xfs_has_realtime(ip->i_mount)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_extsize = pip->i_extsize; + } + } + if ((pip->i_diflags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_diflags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_diflags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_diflags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_diflags |= di_flags; + + /* + * Inode verifiers on older kernels only check that the extent size + * hint is an integer multiple of the rt extent size on realtime files. + * They did not check the hint alignment on a directory with both + * rtinherit and extszinherit flags set. If the misaligned hint is + * propagated from a directory into a new realtime file, new file + * allocations will fail due to math errors in the rt allocator and/or + * trip the verifiers. Validate the hint settings in the new file so + * that we don't let broken hints propagate. + */ + failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, + VFS_I(ip)->i_mode, ip->i_diflags); + if (failaddr) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + } +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static inline void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + xfs_failaddr_t failaddr; + + if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = pip->i_cowextsize; + } + if (pip->i_diflags2 & XFS_DIFLAG2_DAX) + ip->i_diflags2 |= XFS_DIFLAG2_DAX; + if (xfs_is_metadir_inode(pip)) + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; + + /* Don't let invalid cowextsize hints propagate. */ + failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, + VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); + if (failaddr) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + } +} + +/* + * If we need to create attributes immediately after allocating the inode, + * initialise an empty attribute fork right now. We use the default fork offset + * for attributes here as we don't know exactly what size or how many + * attributes we might be adding. We can do this safely here because we know + * the data fork is completely empty and this saves us from needing to run a + * separate transaction to set the fork offset in the immediate future. + * + * If we have parent pointers and the caller hasn't told us that the file will + * never be linked into a directory tree, we /must/ create the attr fork. + */ +static inline bool +xfs_icreate_want_attrfork( + struct xfs_mount *mp, + const struct xfs_icreate_args *args) +{ + if (args->flags & XFS_ICREATE_INIT_XATTRS) + return true; + + if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp)) + return true; + + return false; +} + +/* Initialise an inode's attributes. */ +void +xfs_inode_init( + struct xfs_trans *tp, + const struct xfs_icreate_args *args, + struct xfs_inode *ip) +{ + struct xfs_inode *pip = args->pip; + struct inode *dir = pip ? VFS_I(pip) : NULL; + struct xfs_mount *mp = tp->t_mountp; + struct inode *inode = VFS_I(ip); + unsigned int flags; + int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG | + XFS_ICHGTIME_ACCESS; + + if (args->flags & XFS_ICREATE_TMPFILE) + set_nlink(inode, 0); + else if (S_ISDIR(args->mode)) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + inode->i_rdev = args->rdev; + + if (!args->idmap || pip == NULL) { + /* creating a tree root, sb rooted, or detached file */ + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + ip->i_projid = 0; + inode->i_mode = args->mode; + } else { + /* creating a child in the directory tree */ + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { + inode_fsuid_set(inode, args->idmap); + inode->i_gid = dir->i_gid; + inode->i_mode = args->mode; + } else { + inode_init_owner(args->idmap, inode, dir, args->mode); + } + + /* + * If the group ID of the new file does not match the effective + * group ID or one of the supplementary group IDs, the S_ISGID + * bit is cleared (and only if the irix_sgid_inherit + * compatibility variable is set). + */ + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) + inode->i_mode &= ~S_ISGID; + + ip->i_projid = xfs_get_initial_prid(pip); + } + + ip->i_disk_size = 0; + ip->i_df.if_nextents = 0; + ASSERT(ip->i_nblocks == 0); + + ip->i_extsize = 0; + ip->i_diflags = 0; + + if (xfs_has_v3inodes(mp)) { + inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ + ip->i_cowextsize = 0; + times |= XFS_ICHGTIME_CREATE; + } + + xfs_trans_ichgtime(tp, ip, times); + + flags = XFS_ILOG_CORE; + switch (args->mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_df.if_format = XFS_DINODE_FMT_DEV; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_diflags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); + fallthrough; + case S_IFLNK: + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_bytes = 0; + ip->i_df.if_data = NULL; + break; + default: + ASSERT(0); + } + + if (xfs_icreate_want_attrfork(mp, args)) { + ip->i_forkoff = xfs_default_attroffset(ip) >> 3; + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); + + if (!xfs_has_attr(mp)) { + spin_lock(&mp->m_sb_lock); + xfs_add_attr(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + } + } + + xfs_trans_log_inode(tp, ip, flags); +} + +/* + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. + * + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. + */ + +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ +static int +xfs_iunlink_update_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_inode *ip; + + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) + return 0; + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -ENOLINK; + + ip->i_prev_unlinked = prev_agino; + return 0; +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value, + new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +static int +xfs_iunlink_insert_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t next_agino; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(pag, next_agino)) { + xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Update the prev pointer in the next inode to point back to this + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); + if (error) + return error; + + if (next_agino != NULLAGINO) { + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); + if (error) + return error; + ip->i_next_unlinked = next_agino; + } + + /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. + */ +int +xfs_iunlink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + struct xfs_buf *agibp; + int error; + + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + goto out; + + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); +out: + xfs_perag_put(pag); + return error; +} + +static int +xfs_iunlink_remove_inode( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + + trace_xfs_iunlink_remove(ip); + + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. + */ + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(pag, head_agino)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + agi, sizeof(*agi)); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + return -EFSCORRUPTED; + } + + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); + if (error) + return error; + + /* + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. + */ + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; + + if (head_agino != agino) { + struct xfs_inode *prev_ip; + + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_log_inode(tp, prev_ip, pag, + ip->i_next_unlinked); + prev_ip->i_next_unlinked = ip->i_next_unlinked; + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); + } + + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; + return error; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, 0, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); +} + +/* + * Decrement the link count on an inode & log the change. If this causes the + * link count to go to zero, move the inode to AGI unlinked list so that it can + * be freed when the last active reference goes away via xfs_inactive(). + */ +int +xfs_droplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == 0) { + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count dropped below zero. Pinning link count.", + ip->i_ino); + set_nlink(inode, XFS_NLINK_PINNED); + } + if (inode->i_nlink != XFS_NLINK_PINNED) + drop_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (inode->i_nlink) + return 0; + + return xfs_iunlink(tp, ip); +} + +/* + * Increment the link count on an inode & log the change. + */ +void +xfs_bumplink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + if (inode->i_nlink == XFS_NLINK_PINNED - 1) + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count exceeded maximum. Pinning link count.", + ip->i_ino); + if (inode->i_nlink != XFS_NLINK_PINNED) + inc_nlink(inode); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Free an inode in the ondisk index and zero it out. */ +int +xfs_inode_uninit( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip, + struct xfs_icluster *xic) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. + */ + error = xfs_difree(tp, pag, ip->i_ino, xic); + if (error) + return error; + + error = xfs_iunlink_remove(tp, pag, ip); + if (error) + return error; + + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kfree(ip->i_df.if_data); + ip->i_df.if_data = NULL; + ip->i_df.if_bytes = 0; + } + + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ + ip->i_diflags = 0; + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; + ip->i_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + VFS_I(ip)->i_generation++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h new file mode 100644 index 000000000000..060242998a23 --- /dev/null +++ b/fs/xfs/libxfs/xfs_inode_util.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_INODE_UTIL_H__ +#define __XFS_INODE_UTIL_H__ + +struct xfs_icluster; + +uint16_t xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags); +uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags); +uint32_t xfs_dic2xflags(struct xfs_inode *ip); +uint32_t xfs_ip2xflags(struct xfs_inode *ip); + +prid_t xfs_get_initial_prid(struct xfs_inode *dp); + +/* + * File creation context. + * + * Due to our only partial reliance on the VFS to propagate uid and gid values + * according to accepted Unix behaviors, callers must initialize idmap to the + * correct idmapping structure to get the correct inheritance behaviors when + * XFS_MOUNT_GRPID is set. + * + * To create files detached from the directory tree (e.g. quota inodes), set + * idmap to NULL. To create a tree root, set pip to NULL. + */ +struct xfs_icreate_args { + struct mnt_idmap *idmap; + struct xfs_inode *pip; /* parent inode or null */ + dev_t rdev; + umode_t mode; + +#define XFS_ICREATE_TMPFILE (1U << 0) /* create an unlinked file */ +#define XFS_ICREATE_INIT_XATTRS (1U << 1) /* will set xattrs immediately */ +#define XFS_ICREATE_UNLINKABLE (1U << 2) /* cannot link into dir tree */ + uint16_t flags; +}; + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ +#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */ +void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags); + +void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args, + struct xfs_inode *ip); + +int xfs_inode_uninit(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip, struct xfs_icluster *xic); + +int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip); +int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); + +#endif /* __XFS_INODE_UTIL_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 16872972e1e9..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -115,10 +115,13 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_BUD_FORMAT 26 #define XLOG_REG_TYPE_ATTRI_FORMAT 27 #define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_NAME 29 #define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_MAX 30 - +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 /* * Flags to log operation header @@ -243,6 +246,14 @@ typedef struct xfs_trans_header { #define XFS_LI_BUD 0x1245 #define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ #define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ +#define XFS_LI_XMI 0x1248 /* mapping exchange intent */ +#define XFS_LI_XMD 0x1249 /* mapping exchange done */ +#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ +#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ +#define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ +#define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ +#define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ +#define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -260,7 +271,15 @@ typedef struct xfs_trans_header { { XFS_LI_BUI, "XFS_LI_BUI" }, \ { XFS_LI_BUD, "XFS_LI_BUD" }, \ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ - { XFS_LI_ATTRD, "XFS_LI_ATTRD" } + { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \ + { XFS_LI_XMI, "XFS_LI_XMI" }, \ + { XFS_LI_XMD, "XFS_LI_XMD" }, \ + { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ + { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } /* * Inode Log Item Format definitions. @@ -340,12 +359,6 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_IVERSION 0x8000 -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ - XFS_ILOG_AOWNER) - #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -397,7 +410,7 @@ struct xfs_log_dinode { uint16_t di_mode; /* mode and type of file */ int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ - uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ uint32_t di_nlink; /* number of links to file */ @@ -462,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ @@ -879,6 +897,61 @@ struct xfs_bud_log_format { }; /* + * XMI/XMD (file mapping exchange) log format definitions + */ + +/* This is the structure used to lay out an mapping exchange log item. */ +struct xfs_xmi_log_format { + uint16_t xmi_type; /* xmi log item type */ + uint16_t xmi_size; /* size of this item */ + uint32_t __pad; /* must be zero */ + uint64_t xmi_id; /* xmi identifier */ + + uint64_t xmi_inode1; /* inumber of first file */ + uint64_t xmi_inode2; /* inumber of second file */ + uint32_t xmi_igen1; /* generation of first file */ + uint32_t xmi_igen2; /* generation of second file */ + uint64_t xmi_startoff1; /* block offset into file1 */ + uint64_t xmi_startoff2; /* block offset into file2 */ + uint64_t xmi_blockcount; /* number of blocks */ + uint64_t xmi_flags; /* XFS_EXCHMAPS_* */ + uint64_t xmi_isize1; /* intended file1 size */ + uint64_t xmi_isize2; /* intended file2 size */ +}; + +/* Exchange mappings between extended attribute forks instead of data forks. */ +#define XFS_EXCHMAPS_ATTR_FORK (1ULL << 0) + +/* Set the file sizes when finished. */ +#define XFS_EXCHMAPS_SET_SIZES (1ULL << 1) + +/* + * Exchange the mappings of the two files only if the file allocation units + * mapped to file1's range have been written. + */ +#define XFS_EXCHMAPS_INO1_WRITTEN (1ULL << 2) + +/* Clear the reflink flag from inode1 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK (1ULL << 3) + +/* Clear the reflink flag from inode2 after the operation. */ +#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK (1ULL << 4) + +#define XFS_EXCHMAPS_LOGGED_FLAGS (XFS_EXCHMAPS_ATTR_FORK | \ + XFS_EXCHMAPS_SET_SIZES | \ + XFS_EXCHMAPS_INO1_WRITTEN | \ + XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \ + XFS_EXCHMAPS_CLEAR_INO2_REFLINK) + +/* This is the structure used to lay out an mapping exchange done log item. */ +struct xfs_xmd_log_format { + uint16_t xmd_type; /* xmd log item type */ + uint16_t xmd_size; /* size of this item */ + uint32_t __pad; + uint64_t xmd_xmi_id; /* id of corresponding xmi */ +}; + +/* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into @@ -966,6 +1039,9 @@ struct xfs_icreate_log { #define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ #define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ #define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_PPTR_SET 4 /* Set parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE 5 /* Remove parent pointer */ +#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE 6 /* Replace parent pointer */ #define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ /* @@ -974,6 +1050,7 @@ struct xfs_icreate_log { */ #define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT | \ XFS_ATTR_INCOMPLETE) /* @@ -983,11 +1060,22 @@ struct xfs_icreate_log { struct xfs_attri_log_format { uint16_t alfi_type; /* attri log item type */ uint16_t alfi_size; /* size of this item */ - uint32_t __pad; /* pad to 64 bit aligned */ + uint32_t alfi_igen; /* generation of alfi_ino for pptr ops */ uint64_t alfi_id; /* attri identifier */ uint64_t alfi_ino; /* the inode for this attr operation */ uint32_t alfi_op_flags; /* marks the op as a set or remove */ - uint32_t alfi_name_len; /* attr name length */ + union { + uint32_t alfi_name_len; /* attr name length */ + struct { + /* + * For PPTR_REPLACE, these are the lengths of the old + * and new attr names. The new and old values must + * have the same length. + */ + uint16_t alfi_old_name_len; + uint16_t alfi_new_name_len; + }; + }; uint32_t alfi_value_len; /* attr value length */ uint32_t alfi_attr_filter;/* attr filter flags */ }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 9fe7a9564bca..66c7916fb5cd 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -75,6 +75,14 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; extern const struct xlog_recover_item_ops xlog_attri_item_ops; extern const struct xlog_recover_item_ops xlog_attrd_item_ops; +extern const struct xlog_recover_item_ops xlog_xmi_item_ops; +extern const struct xlog_recover_item_ops xlog_xmd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; +extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -121,6 +129,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); +int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen, + struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); int xlog_alloc_buf_cancel_table(struct xlog *log); diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 9975b93a7412..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -17,6 +17,34 @@ #include "xfs_trace.h" /* + * Shortly after enabling the large extents count feature in 2023, longstanding + * bugs were found in the code that computes the minimum log size. Luckily, + * the bugs resulted in over-estimates of that size, so there's no impact to + * existing users. However, we don't want to reduce the minimum log size + * because that can create the situation where a newer mkfs writes a new + * filesystem that an older kernel won't mount. + * + * Several years prior, we also discovered that the transaction reservations + * for rmap and reflink operations were unnecessarily large. That was fixed, + * but the minimum log size computation was left alone to avoid the + * compatibility problems noted above. Fix that too. + * + * Therefore, we only may correct the computation starting with filesystem + * features that didn't exist in 2023. In other words, only turn this on if + * the filesystem has parent pointers. + * + * This function can be called before the XFS_HAS_* flags have been set up, + * (e.g. mkfs) so we must check the ondisk superblock. + */ +static inline bool +xfs_want_minlogsize_fixes( + struct xfs_sb *sb) +{ + return xfs_sb_is_v5(sb) && + xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT); +} + +/* * Calculate the maximum length in bytes that would be required for a local * attribute value as large attributes out of line are not logged. */ @@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res( MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); + + /* + * If the feature set is new enough, correct a unit conversion error in + * the xattr transaction reservation code that resulted in oversized + * minimum log size computations. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) + size = XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + @@ -49,6 +86,16 @@ xfs_log_calc_trans_resv_for_minlogblocks( unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; /* + * If the feature set is new enough, drop the oversized minimum log + * size computation introduced by the original reflink code. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) { + xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + return; + } + + /* * In the early days of rmap+reflink, we always set the rmap maxlevels * to 9 even if the AG was small enough that it would never grow to * that height. Transaction reservation sizes influence the minimum @@ -61,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c new file mode 100644 index 000000000000..178e89711cb7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_bmap_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_trans_space.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_parent.h" +#include "xfs_health.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" + +/* + * Metadata Directory Tree + * ======================= + * + * These functions provide an abstraction layer for looking up, creating, and + * deleting metadata inodes that live within a special metadata directory tree. + * + * This code does not manage the five existing metadata inodes: real time + * bitmap & summary; and the user, group, and quotas. All other metadata + * inodes must use only the xfs_meta{dir,file}_* functions. + * + * Callers wishing to create or hardlink a metadata inode must create an + * xfs_metadir_update structure, call the appropriate xfs_metadir* function, + * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel + * the update. Files in the metadata directory tree currently cannot be + * unlinked. + * + * When the metadir feature is enabled, all metadata inodes must have the + * "metadata" inode flag set to prevent them from being exposed to the outside + * world. + * + * Callers must take the ILOCK of any inode in the metadata directory tree to + * synchronize access to that inode. It is never necessary to take the IOLOCK + * or the MMAPLOCK since metadata inodes must not be exposed to user space. + */ + +static inline void +xfs_metadir_set_xname( + struct xfs_name *xname, + const char *path, + unsigned char ftype) +{ + xname->name = (const unsigned char *)path; + xname->len = strlen(path); + xname->type = ftype; +} + +/* + * Given a parent directory @dp and a metadata inode path component @xname, + * Look up the inode number in the directory, returning it in @ino. + * @xname.type must match the directory entry's ftype. + * + * Caller must hold ILOCK_EXCL. + */ +static inline int +xfs_metadir_lookup( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_name *xname, + xfs_ino_t *ino) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args = { + .trans = tp, + .dp = dp, + .geo = mp->m_dir_geo, + .name = xname->name, + .namelen = xname->len, + .hashval = xfs_dir2_hashname(mp, xname), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, + }; + int error; + + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_dir_lookup_args(&args); + if (error) + return error; + + if (!xfs_verify_ino(mp, args.inumber)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + trace_xfs_metadir_lookup(dp, xname, args.inumber); + *ino = args.inumber; + return 0; +} + +/* + * Look up and read a metadata inode from the metadata directory. If the path + * component doesn't exist, return -ENOENT. + */ +int +xfs_metadir_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + const char *path, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_name xname; + xfs_ino_t ino; + int error; + + xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN); + + xfs_ilock(dp, XFS_ILOCK_EXCL); + error = xfs_metadir_lookup(tp, dp, &xname, &ino); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (error) + return error; + return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); +} + +/* + * Unlock and release resources after committing (or cancelling) a metadata + * directory tree operation. The caller retains its reference to @upd->ip + * and must release it explicitly. + */ +static inline void +xfs_metadir_teardown( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_teardown(upd, error); + + if (upd->ppargs) { + xfs_parent_finish(upd->dp->i_mount, upd->ppargs); + upd->ppargs = NULL; + } + + if (upd->ip) { + if (upd->ip_locked) + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + upd->ip_locked = false; + } + + if (upd->dp_locked) + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + upd->dp_locked = false; +} + +/* + * Begin the process of creating a metadata file by allocating transactions + * and taking whatever resources we're going to need. + */ +int +xfs_metadir_start_create( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip == NULL); + ASSERT(xfs_has_metadir(mp)); + ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + /* + * If we ever need the ability to create rt metadata files on a + * pre-metadir filesystem, we'll need to dqattach the parent here. + * Currently we assume that mkfs will create the files and quotacheck + * will account for them. + */ + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp); + if (error) + goto out_teardown; + + /* + * Lock the parent directory if there is one. We can't ijoin it to + * the transaction until after the child file has been created. + */ + xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + upd->dp_locked = true; + + trace_xfs_metadir_start_create(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Create a metadata inode with the given @mode, and insert it into the + * metadata directory tree at the given @upd->path. The path up to the final + * component must already exist. The final path component must not exist. + * + * The new metadata inode will be attached to the update structure @upd->ip, + * with the ILOCK held until the caller releases it. + * + * NOTE: This function may return a new inode to the caller even if it returns + * a negative error code. If an inode is passed back, the caller must finish + * setting up the inode before releasing it. + */ +int +xfs_metadir_create( + struct xfs_metadir_update *upd, + umode_t mode) +{ + struct xfs_icreate_args args = { + .pip = upd->dp, + .mode = mode, + }; + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + + /* Check that the name does not already exist in the directory. */ + xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dialloc(&upd->tp, &args, &ino); + if (error) + return error; + error = xfs_icreate(upd->tp, ino, &args, &upd->ip); + if (error) + return error; + du.ip = upd->ip; + xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type); + upd->ip_locked = true; + + /* + * Join the directory inode to the transaction. We do not do it + * earlier because xfs_dialloc rolls the transaction. + */ + xfs_trans_ijoin(upd->tp, upd->dp, 0); + + /* Create the entry. */ + if (S_ISDIR(args.mode)) + resblks = xfs_mkdir_space_res(mp, xname.len); + else + resblks = xfs_create_space_res(mp, xname.len); + xname.type = xfs_mode_to_ftype(args.mode); + + trace_xfs_metadir_try_create(upd); + + error = xfs_dir_create_child(upd->tp, resblks, &du); + if (error) + return error; + + /* Metadir files are not accounted to quota. */ + + trace_xfs_metadir_create(upd); + + return 0; +} + +#ifndef __KERNEL__ +/* + * Begin the process of linking a metadata file by allocating transactions + * and locking whatever resources we're going to need. + */ +int +xfs_metadir_start_link( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + unsigned int resblks; + int nospace_error = 0; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip != NULL); + ASSERT(xfs_has_metadir(mp)); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + resblks = xfs_link_space_res(mp, MAXNAMELEN); + error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip, + &resblks, &upd->tp, &nospace_error); + if (error) + goto out_teardown; + if (!resblks) { + /* We don't allow reservationless updates. */ + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + error = nospace_error; + goto out_teardown; + } + + upd->dp_locked = true; + upd->ip_locked = true; + + trace_xfs_metadir_start_link(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Link the metadata directory given by @path to the inode @upd->ip. + * The path (up to the final component) must already exist, but the final + * component must not already exist. + */ +int +xfs_metadir_link( + struct xfs_metadir_update *upd) +{ + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ip = upd->ip, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL); + + /* Look up the name in the current directory. */ + xfs_metadir_set_xname(&xname, upd->path, + xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode)); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + resblks = xfs_link_space_res(mp, xname.len); + error = xfs_dir_add_child(upd->tp, resblks, &du); + if (error) + return error; + + trace_xfs_metadir_link(upd); + + return 0; +} +#endif /* ! __KERNEL__ */ + +/* Commit a metadir update and unlock/drop all resources. */ +int +xfs_metadir_commit( + struct xfs_metadir_update *upd) +{ + int error; + + trace_xfs_metadir_commit(upd); + + error = xfs_trans_commit(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); + return error; +} + +/* Cancel a metadir update and unlock/drop all resources. */ +void +xfs_metadir_cancel( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_cancel(upd); + + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); +} + +/* Create a metadata for the last component of the path. */ +int +xfs_metadir_mkdir( + struct xfs_inode *dp, + const char *path, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .path = path, + .metafile_type = XFS_METAFILE_DIR, + }; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + /* Allocate a transaction to create the last directory. */ + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + /* Create the subdirectory and take our reference. */ + error = xfs_metadir_create(&upd, S_IFDIR); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_irele; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); +out_irele: + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } + return error; +} diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h new file mode 100644 index 000000000000..bfecac7d3d14 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METADIR_H__ +#define __XFS_METADIR_H__ + +/* Cleanup widget for metadata inode creation and deletion. */ +struct xfs_metadir_update { + /* Parent directory */ + struct xfs_inode *dp; + + /* Path to metadata file */ + const char *path; + + /* Parent pointer update context */ + struct xfs_parent_args *ppargs; + + /* Child metadata file */ + struct xfs_inode *ip; + + struct xfs_trans *tp; + + enum xfs_metafile_type metafile_type; + + unsigned int dp_locked:1; + unsigned int ip_locked:1; +}; + +int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp, + const char *path, enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp); + +int xfs_metadir_start_create(struct xfs_metadir_update *upd); +int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode); + +int xfs_metadir_start_link(struct xfs_metadir_update *upd); +int xfs_metadir_link(struct xfs_metadir_update *upd); + +int xfs_metadir_commit(struct xfs_metadir_update *upd); +void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error); + +int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path, + struct xfs_inode **ipp); + +#endif /* __XFS_METADIR_H__ */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c new file mode 100644 index 000000000000..225923e463c4 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" + +static const struct { + enum xfs_metafile_type mtype; + const char *name; +} xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR }; + +const char * +xfs_metafile_type_str(enum xfs_metafile_type metatype) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) { + if (xfs_metafile_type_strs[i].mtype == metatype) + return xfs_metafile_type_strs[i].name; + } + + return NULL; +} + +/* Set up an inode to be recognized as a metadata directory inode. */ +void +xfs_metafile_set_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip, + enum xfs_metafile_type metafile_type) +{ + VFS_I(ip)->i_mode &= ~0777; + VFS_I(ip)->i_uid = GLOBAL_ROOT_UID; + VFS_I(ip)->i_gid = GLOBAL_ROOT_GID; + if (S_ISDIR(VFS_I(ip)->i_mode)) + ip->i_diflags |= XFS_METADIR_DIFLAGS; + else + ip->i_diflags |= XFS_METAFILE_DIFLAGS; + ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; + ip->i_metatype = metafile_type; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Clear the metadata directory inode flag. */ +void +xfs_metafile_clear_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(VFS_I(ip)->i_nlink == 0); + + ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * Is the metafile reservations at or beneath a certain threshold? + */ +static inline bool +xfs_metafile_resv_can_cover( + struct xfs_mount *mp, + int64_t rhs) +{ + /* + * The amount of space that can be allocated to this metadata file is + * the remaining reservation for the particular metadata file + the + * global free block count. Take care of the first case to avoid + * touching the per-cpu counter. + */ + if (mp->m_metafile_resv_avail >= rhs) + return true; + + /* + * There aren't enough blocks left in the inode's reservation, but it + * isn't critical unless there also isn't enough free space. + */ + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; +} + +/* + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). + */ +bool +xfs_metafile_resv_critical( + struct xfs_mount *mp) +{ + ASSERT(xfs_has_metadir(mp)); + + trace_xfs_metafile_resv_critical(mp, 0); + + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) + return true; + + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) + return true; + + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); +} + +/* Allocate a block from the metadata file's reservation. */ +void +xfs_metafile_resv_alloc_space( + struct xfs_inode *ip, + struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = ip->i_mount; + int64_t len = args->len; + + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(args->resv == XFS_AG_RESV_METAFILE); + + trace_xfs_metafile_resv_alloc_space(mp, args->len); + + /* + * Allocate the blocks from the metadata inode's block reservation + * and update the ondisk sb counter. + */ + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { + int64_t from_resv; + + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; + xfs_mod_delalloc(ip, 0, -from_resv); + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, + -from_resv); + len -= from_resv; + } + + /* + * Any allocation in excess of the reservation requires in-core and + * on-disk fdblocks updates. If we can grab @len blocks from the + * in-core fdblocks then all we need to do is update the on-disk + * superblock; if not, then try to steal some from the transaction's + * block reservation. Overruns are only expected for rmap btrees. + */ + if (len) { + unsigned int field; + int error; + + error = xfs_dec_fdblocks(ip->i_mount, len, true); + if (error) + field = XFS_TRANS_SB_FDBLOCKS; + else + field = XFS_TRANS_SB_RES_FDBLOCKS; + + xfs_trans_mod_sb(args->tp, field, -len); + } + + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + + ip->i_nblocks += args->len; + xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); +} + +/* Free a block to the metadata file's reservation. */ +void +xfs_metafile_resv_free_space( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_filblks_t len) +{ + struct xfs_mount *mp = ip->i_mount; + int64_t to_resv; + + ASSERT(xfs_is_metadir_inode(ip)); + + trace_xfs_metafile_resv_free_space(mp, len); + + ip->i_nblocks -= len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + + /* + * Add the freed blocks back into the inode's delalloc reservation + * until it reaches the maximum size. Update the ondisk fdblocks only. + */ + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); + if (to_resv > 0) { + to_resv = min_t(int64_t, to_resv, len); + mp->m_metafile_resv_avail += to_resv; + xfs_mod_delalloc(ip, 0, to_resv); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); + len -= to_resv; + } + mutex_unlock(&mp->m_metafile_resv_lock); + + /* + * Everything else goes back to the filesystem, so update the in-core + * and on-disk counters. + */ + if (len) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); +} + +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ +void +xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (!xfs_has_metadir(mp)) + return; + + trace_xfs_metafile_resv_free(mp, 0); + + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); +} + +/* Set up a metafile space reservation. */ +int +xfs_metafile_resv_init( + struct xfs_mount *mp) +{ + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; + xfs_filblks_t hidden_space; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; + + if (!xfs_has_metadir(mp)) + return 0; + + /* + * Free any previous reservation to have a clean slate. + */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. + */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; + + /* + * Space taken by the per-AG metadata btrees are accounted on-disk as + * used space. We therefore only hide the space that is reserved but + * not used by the trees. + */ + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; + + error = xfs_dec_fdblocks(mp, hidden_space, true); + if (error) { + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; + } + + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); + +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; +} diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h new file mode 100644 index 000000000000..ae6f9e779b98 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METAFILE_H__ +#define __XFS_METAFILE_H__ + +const char *xfs_metafile_type_str(enum xfs_metafile_type metatype); + +/* All metadata files must have these flags set. */ +#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \ + XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | \ + XFS_DIFLAG_NODUMP | \ + XFS_DIFLAG_NODEFRAG) + +/* All metadata directories must have these flags set. */ +#define XFS_METADIR_DIFLAGS (XFS_METAFILE_DIFLAGS | \ + XFS_DIFLAG_NOSYMLINKS) + +void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, + enum xfs_metafile_type metafile_type); +void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); + +/* Space reservations for metadata inodes. */ +struct xfs_alloc_arg; + +bool xfs_metafile_resv_critical(struct xfs_mount *mp); +void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, + struct xfs_alloc_arg *args); +void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, + xfs_filblks_t len); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); + +/* Code specific to kernel/userspace; must be provided externally. */ + +int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); +int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); + +#endif /* __XFS_METAFILE_H__ */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 81885a6a028e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -19,40 +19,46 @@ static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) +#define XFS_CHECK_SB_OFFSET(field, offset) \ + XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \ + XFS_CHECK_OFFSET(struct xfs_sb, field, offset); + static inline void __init xfs_check_ondisk_structs(void) { - /* ag/file structures */ + /* file structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); - XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); - XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); + + /* space btrees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); + XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); @@ -67,32 +73,38 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4); /* realtime structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56); XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); + XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); /* - * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to - * 4 bytes anyway so it's not obviously a problem. Hence for the moment - * we don't check this structure. This can be re-instated when the attr - * definitions are updated to use c99 VLA definitions. + * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad + * it to 4 bytes anyway so it's not obviously a problem. Hence for the + * moment we don't check this structure. This can be re-instated when + * the attr definitions are updated to use c99 VLA definitions. * - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12); */ - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); @@ -100,25 +112,40 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); - XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); - XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); + + /* ondisk dir/attr structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); @@ -155,6 +182,16 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* ondisk log structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88); + + /* parent pointer ioctls */ + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64); + /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5 @@ -194,6 +231,72 @@ xfs_check_ondisk_structs(void) XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, 16299260424LL); + + /* superblock field checks we got from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); + XFS_CHECK_SB_OFFSET(sb_magicnum, 0); + XFS_CHECK_SB_OFFSET(sb_blocksize, 4); + XFS_CHECK_SB_OFFSET(sb_dblocks, 8); + XFS_CHECK_SB_OFFSET(sb_rblocks, 16); + XFS_CHECK_SB_OFFSET(sb_rextents, 24); + XFS_CHECK_SB_OFFSET(sb_uuid, 32); + XFS_CHECK_SB_OFFSET(sb_logstart, 48); + XFS_CHECK_SB_OFFSET(sb_rootino, 56); + XFS_CHECK_SB_OFFSET(sb_rbmino, 64); + XFS_CHECK_SB_OFFSET(sb_rsumino, 72); + XFS_CHECK_SB_OFFSET(sb_rextsize, 80); + XFS_CHECK_SB_OFFSET(sb_agblocks, 84); + XFS_CHECK_SB_OFFSET(sb_agcount, 88); + XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92); + XFS_CHECK_SB_OFFSET(sb_logblocks, 96); + XFS_CHECK_SB_OFFSET(sb_versionnum, 100); + XFS_CHECK_SB_OFFSET(sb_sectsize, 102); + XFS_CHECK_SB_OFFSET(sb_inodesize, 104); + XFS_CHECK_SB_OFFSET(sb_inopblock, 106); + XFS_CHECK_SB_OFFSET(sb_blocklog, 120); + XFS_CHECK_SB_OFFSET(sb_fname[12], 120); + XFS_CHECK_SB_OFFSET(sb_sectlog, 121); + XFS_CHECK_SB_OFFSET(sb_inodelog, 122); + XFS_CHECK_SB_OFFSET(sb_inopblog, 123); + XFS_CHECK_SB_OFFSET(sb_agblklog, 124); + XFS_CHECK_SB_OFFSET(sb_rextslog, 125); + XFS_CHECK_SB_OFFSET(sb_inprogress, 126); + XFS_CHECK_SB_OFFSET(sb_imax_pct, 127); + XFS_CHECK_SB_OFFSET(sb_icount, 128); + XFS_CHECK_SB_OFFSET(sb_ifree, 136); + XFS_CHECK_SB_OFFSET(sb_fdblocks, 144); + XFS_CHECK_SB_OFFSET(sb_frextents, 152); + XFS_CHECK_SB_OFFSET(sb_uquotino, 160); + XFS_CHECK_SB_OFFSET(sb_gquotino, 168); + XFS_CHECK_SB_OFFSET(sb_qflags, 176); + XFS_CHECK_SB_OFFSET(sb_flags, 178); + XFS_CHECK_SB_OFFSET(sb_shared_vn, 179); + XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180); + XFS_CHECK_SB_OFFSET(sb_unit, 184); + XFS_CHECK_SB_OFFSET(sb_width, 188); + XFS_CHECK_SB_OFFSET(sb_dirblklog, 192); + XFS_CHECK_SB_OFFSET(sb_logsectlog, 193); + XFS_CHECK_SB_OFFSET(sb_logsectsize, 194); + XFS_CHECK_SB_OFFSET(sb_logsunit, 196); + XFS_CHECK_SB_OFFSET(sb_features2, 200); + XFS_CHECK_SB_OFFSET(sb_bad_features2, 204); + XFS_CHECK_SB_OFFSET(sb_features_compat, 208); + XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212); + XFS_CHECK_SB_OFFSET(sb_features_incompat, 216); + XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220); + XFS_CHECK_SB_OFFSET(sb_crc, 224); + XFS_CHECK_SB_OFFSET(sb_spino_align, 228); + XFS_CHECK_SB_OFFSET(sb_pquotino, 232); + XFS_CHECK_SB_OFFSET(sb_lsn, 240); + XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248); + XFS_CHECK_SB_OFFSET(sb_metadirino, 264); + XFS_CHECK_SB_OFFSET(sb_rgcount, 272); + XFS_CHECK_SB_OFFSET(sb_rgextents, 276); + XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); + XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c new file mode 100644 index 000000000000..69366c44a701 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_sf.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log.h" +#include "xfs_xattr.h" +#include "xfs_parent.h" +#include "xfs_trans_space.h" +#include "xfs_attr_item.h" +#include "xfs_health.h" + +struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer attribute handling. + * + * Because the attribute name is a filename component, it will never be longer + * than 255 bytes and must not contain nulls or slashes. These are roughly the + * same constraints that apply to attribute names. + * + * The attribute value must always be a struct xfs_parent_rec. This means the + * attribute will never be in remote format because 12 bytes is nowhere near + * xfs_attr_leaf_entsize_local_max() (~75% of block size). + * + * Creating a new parent attribute will always create a new attribute - there + * should never, ever be an existing attribute in the tree for a new inode. + * ENOSPC behavior is problematic - creating the inode without the parent + * pointer is effectively a corruption, so we allow parent attribute creation + * to dip into the reserve block pool to avoid unexpected ENOSPC errors from + * occurring. + */ + +/* Return true if parent pointer attr name is valid. */ +bool +xfs_parent_namecheck( + unsigned int attr_flags, + const void *name, + size_t length) +{ + /* + * Parent pointers always use logged operations, so there should never + * be incomplete xattrs. + */ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + + return xfs_dir2_namecheck(name, length); +} + +/* Return true if parent pointer attr value is valid. */ +bool +xfs_parent_valuecheck( + struct xfs_mount *mp, + const void *value, + size_t valuelen) +{ + const struct xfs_parent_rec *rec = value; + + if (!xfs_has_parent(mp)) + return false; + + /* The xattr value must be a parent record. */ + if (valuelen != sizeof(struct xfs_parent_rec)) + return false; + + /* The parent record must be local. */ + if (value == NULL) + return false; + + /* The parent inumber must be valid. */ + if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino))) + return false; + + return true; +} + +/* Compute the attribute name hash for a parent pointer. */ +xfs_dahash_t +xfs_parent_hashval( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + xfs_ino_t parent_ino) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + + /* + * Use the same dirent name hash as would be used on the directory, but + * mix in the parent inode number to avoid collisions on hardlinked + * files with identical names but different parents. + */ + return xfs_dir2_hashname(mp, &xname) ^ + upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino); +} + +/* Compute the attribute name hash from the xattr components. */ +xfs_dahash_t +xfs_parent_hashattr( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + const struct xfs_parent_rec *rec = value; + + /* Requires a local attr value in xfs_parent_rec format */ + if (valuelen != sizeof(struct xfs_parent_rec)) { + ASSERT(valuelen == sizeof(struct xfs_parent_rec)); + return 0; + } + + if (!value) { + ASSERT(value != NULL); + return 0; + } + + return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino)); +} + +/* + * Initialize the parent pointer arguments structure. Caller must have zeroed + * the contents of @args. @tp is only required for updates. + */ +static void +xfs_parent_da_args_init( + struct xfs_da_args *args, + struct xfs_trans *tp, + struct xfs_parent_rec *rec, + struct xfs_inode *child, + xfs_ino_t owner, + const struct xfs_name *parent_name) +{ + args->geo = child->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->attr_filter = XFS_ATTR_PARENT; + args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT; + args->trans = tp; + args->dp = child; + args->owner = owner; + args->name = parent_name->name; + args->namelen = parent_name->len; + args->value = rec; + args->valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_sethash(args); +} + +/* Make sure the incore state is ready for a parent pointer query/update. */ +static inline int +xfs_parent_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *child) +{ + /* Parent pointers require that the attr fork must exist. */ + if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) { + xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT); + return -EFSCORRUPTED; + } + + return xfs_iread_extents(tp, child, XFS_ATTR_FORK); +} + +/* Add a parent pointer to reflect a dirent addition. */ +int +xfs_parent_addname( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET); + return 0; +} + +/* Remove a parent pointer to reflect a dirent removal. */ +int +xfs_parent_removename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE); + return 0; +} + +/* Replace one parent pointer with another to reflect a rename. */ +int +xfs_parent_replacename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, + const struct xfs_name *old_name, + struct xfs_inode *new_dp, + const struct xfs_name *new_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, old_dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, old_name); + + xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp); + ppargs->args.new_name = new_name->name; + ppargs->args.new_namelen = new_name->len; + ppargs->args.new_value = &ppargs->new_rec; + ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE); + return 0; +} + +/* + * Extract parent pointer information from any parent pointer xattr into + * @parent_ino/gen. The last two parameters can be NULL pointers. + * + * Returns 0 if this is not a parent pointer xattr at all; or -EFSCORRUPTED for + * garbage. + */ +int +xfs_parent_from_attr( + struct xfs_mount *mp, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + xfs_ino_t *parent_ino, + uint32_t *parent_gen) +{ + const struct xfs_parent_rec *rec = value; + + ASSERT(attr_flags & XFS_ATTR_PARENT); + + if (!xfs_parent_namecheck(attr_flags, name, namelen)) + return -EFSCORRUPTED; + if (!xfs_parent_valuecheck(mp, value, valuelen)) + return -EFSCORRUPTED; + + if (parent_ino) + *parent_ino = be64_to_cpu(rec->p_ino); + if (parent_gen) + *parent_gen = be32_to_cpu(rec->p_gen); + return 0; +} + +/* + * Look up a parent pointer record (@parent_name -> @pptr) of @ip. + * + * Caller must hold at least ILOCK_SHARED. The scratchpad need not be + * initialized. + * + * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a + * negative errno. + */ +int +xfs_parent_lookup( + struct xfs_trans *tp, + struct xfs_inode *ip, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name); + return xfs_attr_get_ilocked(scratch); +} + +/* Sanity-check a parent pointer before we try to perform repairs. */ +static inline bool +xfs_parent_sanity_check( + struct xfs_mount *mp, + const struct xfs_name *parent_name, + const struct xfs_parent_rec *pptr) +{ + if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name, + parent_name->len)) + return false; + + if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr))) + return false; + + return true; +} + + +/* + * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_set( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false); +} + +/* + * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_unset( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false); +} diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h new file mode 100644 index 000000000000..b8036527cdc7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All Rights Reserved. + */ +#ifndef __XFS_PARENT_H__ +#define __XFS_PARENT_H__ + +/* Metadata validators */ +bool xfs_parent_namecheck(unsigned int attr_flags, const void *name, + size_t length); +bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value, + size_t valuelen); + +xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name, + int namelen, xfs_ino_t parent_ino); +xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name, + int namelen, const void *value, int valuelen); + +/* Initializes a xfs_parent_rec to be stored as an attribute name. */ +static inline void +xfs_parent_rec_init( + struct xfs_parent_rec *rec, + xfs_ino_t ino, + uint32_t gen) +{ + rec->p_ino = cpu_to_be64(ino); + rec->p_gen = cpu_to_be32(gen); +} + +/* Initializes a xfs_parent_rec to be stored as an attribute name. */ +static inline void +xfs_inode_to_parent_rec( + struct xfs_parent_rec *rec, + const struct xfs_inode *dp) +{ + xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation); +} + +extern struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer information needed to pass around the deferred xattr update + * machinery. + */ +struct xfs_parent_args { + struct xfs_parent_rec rec; + struct xfs_parent_rec new_rec; + struct xfs_da_args args; +}; + +/* + * Start a parent pointer update by allocating the context object we need to + * perform a parent pointer update. + */ +static inline int +xfs_parent_start( + struct xfs_mount *mp, + struct xfs_parent_args **ppargsp) +{ + if (!xfs_has_parent(mp)) { + *ppargsp = NULL; + return 0; + } + + *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL); + if (!*ppargsp) + return -ENOMEM; + return 0; +} + +/* Finish a parent pointer update by freeing the context object. */ +static inline void +xfs_parent_finish( + struct xfs_mount *mp, + struct xfs_parent_args *ppargs) +{ + if (ppargs) + kmem_cache_free(xfs_parent_args_cache, ppargs); +} + +int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_replacename(struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, const struct xfs_name *old_name, + struct xfs_inode *new_dp, const struct xfs_name *new_name, + struct xfs_inode *child); + +int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags, + const unsigned char *name, unsigned int namelen, + const void *value, unsigned int valuelen, + xfs_ino_t *parent_ino, uint32_t *parent_gen); + +/* Repair functions */ +int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); + +#endif /* __XFS_PARENT_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index cb035da3f990..763d941a8420 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -56,7 +56,7 @@ typedef uint8_t xfs_dqtype_t; * And, of course, we also need to take into account the dquot log format item * used to describe each dquot. */ -#define XFS_DQUOT_LOGRES(mp) \ +#define XFS_DQUOT_LOGRES \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) @@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, __be32 dtimer); __be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); +static inline const char * +xfs_dqinode_path(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return "user"; + case XFS_DQTYPE_GROUP: + return "group"; + case XFS_DQTYPE_PROJ: + return "project"; + } + + ASSERT(0); + return NULL; +} + +static inline enum xfs_metafile_type +xfs_dqinode_metafile_type(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_METAFILE_USRQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_METAFILE_GRPQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_METAFILE_PRJQUOTA; + } + + ASSERT(0); + return XFS_METAFILE_UNKNOWN; +} + +unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type); + +int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dqtype_t type, struct xfs_inode **ipp); +int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode **ipp); +int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode *ip); +int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp); +int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 511c912d515c..cebe83f7842a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -24,6 +24,10 @@ #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_refcount_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtalloc.h" +#include "xfs_rtrefcount_btree.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -51,7 +55,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -71,7 +75,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; @@ -91,7 +95,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + trace_xfs_refcount_lookup(cur, xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; @@ -127,7 +131,7 @@ xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) return __this_address; if (!xfs_refcount_check_domain(irec)) @@ -137,12 +141,43 @@ xfs_refcount_check_irec( if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) return __this_address; - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) return __this_address; return NULL; } +xfs_failaddr_t +xfs_rtrefcount_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) + return __this_address; + + if (!xfs_refcount_check_domain(irec)) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount)) + return __this_address; + + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) + return __this_address; + + return NULL; +} + +static inline xfs_failaddr_t +xfs_refcount_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec) +{ + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); +} + static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, @@ -151,9 +186,15 @@ xfs_refcount_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { + xfs_warn(mp, + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); + } else { + xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); + } xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -179,11 +220,11 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); + fa = xfs_refcount_check_btrec(cur, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur, irec); return 0; } @@ -201,7 +242,7 @@ xfs_refcount_update( uint32_t start; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_update(cur, irec); start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); @@ -211,8 +252,7 @@ xfs_refcount_update( error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_update_error(cur, error, _RET_IP_); return error; } @@ -229,7 +269,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_insert(cur, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; @@ -247,8 +287,7 @@ xfs_refcount_insert( out_error: if (error) - trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_insert_error(cur, error, _RET_IP_); return error; } @@ -275,7 +314,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); + trace_xfs_refcount_delete(cur, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { xfs_btree_mark_sick(cur); @@ -288,8 +327,7 @@ xfs_refcount_delete( &found_rec); out_error: if (error) - trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_delete_error(cur, error, _RET_IP_); return error; } @@ -413,8 +451,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - &rcext, agbno); + trace_xfs_refcount_split_extent(cur, &rcext, agbno); /* Establish the right extent. */ tmp = rcext; @@ -438,8 +475,7 @@ xfs_refcount_split_extent( return error; out_error: - trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_); return error; } @@ -458,8 +494,7 @@ xfs_refcount_merge_center_extents( int error; int found_rec; - trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, center, right); + trace_xfs_refcount_merge_center_extents(cur, left, center, right); ASSERT(left->rc_domain == center->rc_domain); ASSERT(right->rc_domain == center->rc_domain); @@ -522,8 +557,7 @@ xfs_refcount_merge_center_extents( return error; out_error: - trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_); return error; } @@ -541,8 +575,7 @@ xfs_refcount_merge_left_extent( int error; int found_rec; - trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, left, cleft); + trace_xfs_refcount_merge_left_extent(cur, left, cleft); ASSERT(left->rc_domain == cleft->rc_domain); @@ -589,8 +622,7 @@ xfs_refcount_merge_left_extent( return error; out_error: - trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_); return error; } @@ -607,8 +639,7 @@ xfs_refcount_merge_right_extent( int error; int found_rec; - trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, cright, right); + trace_xfs_refcount_merge_right_extent(cur, cright, right); ASSERT(right->rc_domain == cright->rc_domain); @@ -658,8 +689,7 @@ xfs_refcount_merge_right_extent( return error; out_error: - trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_); return error; } @@ -748,13 +778,11 @@ not_found: cleft->rc_refcount = 1; cleft->rc_domain = domain; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, cleft, agbno); + trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno); return error; out_error: - trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_); return error; } @@ -843,13 +871,12 @@ not_found: cright->rc_refcount = 1; cright->rc_domain = domain; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, - cright, right, agbno + aglen); + trace_xfs_refcount_find_right_extent(cur, cright, right, + agbno + aglen); return error; out_error: - trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_); return error; } @@ -866,9 +893,9 @@ xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ - if (irec->rc_refcount == MAXREFCOUNT) - return MAXREFCOUNT; + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) + return XFS_REFC_REFCOUNT_MAX; return irec->rc_refcount + adjust; } @@ -911,7 +938,7 @@ xfs_refc_want_merge_center( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount + right->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; *ulenp = ulen; @@ -946,7 +973,7 @@ xfs_refc_want_merge_left( * hence we need to catch u32 addition overflows here. */ ulen += cleft->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -980,7 +1007,7 @@ xfs_refc_want_merge_right( * hence we need to catch u32 addition overflows here. */ ulen += cright->rc_blockcount; - if (ulen >= MAXREFCEXTLEN) + if (ulen >= XFS_REFC_LEN_MAX) return false; return true; @@ -1078,7 +1105,7 @@ xfs_refcount_still_have_space( */ overhead = xfs_allocfree_block_count(cur->bc_mp, cur->bc_refc.shape_changes); - overhead += cur->bc_mp->m_refc_maxlevels; + overhead += cur->bc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -1098,6 +1125,22 @@ xfs_refcount_still_have_space( cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } +/* Schedule an extent free. */ +static int +xrefc_free_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *rec) +{ + unsigned int flags = 0; + + if (xfs_btree_is_rtrefcount(cur->bc_ops)) + flags |= XFS_FREE_EXTENT_REALTIME; + + return xfs_free_extent_later(cur->bc_tp, + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); +} + /* * Adjust the refcounts of middle extents. At this point we should have * split extents that crossed the adjustment range; merged with adjacent @@ -1114,7 +1157,6 @@ xfs_refcount_adjust_extents( struct xfs_refcount_irec ext, tmp; int error; int found_rec, found_tmp; - xfs_fsblock_t fsbno; /* Merging did all the work already. */ if (*aglen == 0) @@ -1130,7 +1172,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_SHARED; @@ -1148,8 +1190,7 @@ xfs_refcount_adjust_extents( tmp.rc_refcount = 1 + adj; tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); /* * Either cover the hole (increment) or @@ -1168,12 +1209,7 @@ xfs_refcount_adjust_extents( goto out_error; } } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - tmp.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + error = xrefc_free_extent(cur, &tmp); if (error) goto out_error; } @@ -1211,11 +1247,10 @@ xfs_refcount_adjust_extents( * Adjust the reference count and either update the tree * (incr) or free the blocks (decr). */ - if (ext.rc_refcount == MAXREFCOUNT) + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) goto skip; ext.rc_refcount += adj; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); + trace_xfs_refcount_modify_extent(cur, &ext); cur->bc_refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); @@ -1232,12 +1267,7 @@ xfs_refcount_adjust_extents( } goto advloop; } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, - ext.rc_startblock); - error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + error = xrefc_free_extent(cur, &ext); if (error) goto out_error; } @@ -1254,8 +1284,7 @@ advloop: return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1272,11 +1301,9 @@ xfs_refcount_adjust( int error; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_increase(cur, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *agbno, *aglen); + trace_xfs_refcount_decrease(cur, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. @@ -1315,28 +1342,10 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_error(cur, error, _RET_IP_); return error; } -/* Clean up after calling xfs_refcount_finish_one. */ -void -xfs_refcount_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) -{ - struct xfs_buf *agbp; - - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); -} - /* * Set up a continuation a deferred refcount operation by updating the intent. * Checks to make sure we're not going to run off the end of the AG. @@ -1348,7 +1357,7 @@ xfs_refcount_continue_op( xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, ri->ri_blockcount))) { @@ -1356,10 +1365,10 @@ xfs_refcount_continue_op( return -EFSCORRUPTED; } - ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno); ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1378,7 +1387,7 @@ xfs_refcount_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; + struct xfs_btree_cur *rcur = *pcur; struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; @@ -1387,9 +1396,7 @@ xfs_refcount_finish_one( bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), - ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), - ri->ri_blockcount); + trace_xfs_refcount_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; @@ -1398,25 +1405,25 @@ xfs_refcount_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { + if (rcur != NULL && rcur->bc_group != ri->ri_group) { nr_ops = rcur->bc_refc.nr_ops; shape_changes = rcur->bc_refc.shape_changes; - xfs_refcount_finish_one_cleanup(tp, rcur, 0); + xfs_btree_del_cursor(rcur, 0); rcur = NULL; *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(ri->ri_pag, tp, + struct xfs_perag *pag = to_perag(ri->ri_group); + + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); + *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); rcur->bc_refc.nr_ops = nr_ops; rcur->bc_refc.shape_changes = shape_changes; } - *pcur = rcur; switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: @@ -1452,8 +1459,116 @@ xfs_refcount_finish_one( return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, - ri->ri_type, bno, ri->ri_blockcount); + trace_xfs_refcount_finish_one_leftover(mp, ri); + return error; +} + +/* + * Set up a continuation a deferred rtrefcount operation by updating the + * intent. Checks to make sure we're not going to run off the end of the + * rtgroup. + */ +static inline int +xfs_rtrefcount_continue_op( + struct xfs_btree_cur *cur, + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); + + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); + return 0; +} + +/* + * Process one of the deferred realtime refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + */ +int +xfs_rtrefcount_finish_one( + struct xfs_trans *tp, + struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + struct xfs_btree_cur *rcur = *pcur; + int error = 0; + xfs_rgblock_t bno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); + + trace_xfs_refcount_deferred(mp, ri); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + if (rcur != NULL && rcur->bc_group != ri->ri_group) { + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; + xfs_btree_del_cursor(rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); + + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; + } + + switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + return error; + if (ri->ri_blockcount > 0) + error = xfs_rtrefcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + return error; + ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, ri); return error; } @@ -1464,25 +1579,21 @@ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, + bool isrt, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { struct xfs_refcount_intent *ri; - trace_xfs_refcount_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), - type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - blockcount); - ri = kmem_cache_alloc(xfs_refcount_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + ri->ri_realtime = isrt; - xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); + xfs_refcount_defer_add(tp, ri); } /* @@ -1491,12 +1602,13 @@ __xfs_refcount_add( void xfs_refcount_increase_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1506,12 +1618,13 @@ xfs_refcount_increase_extent( void xfs_refcount_decrease_extent( struct xfs_trans *tp, + bool isrt, struct xfs_bmbt_irec *PREV) { if (!xfs_has_reflink(tp->t_mountp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, PREV->br_blockcount); } @@ -1537,8 +1650,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_find_shared(cur, agbno, aglen); /* By default, skip the whole range */ *fbno = NULLAGBLOCK; @@ -1625,13 +1737,11 @@ xfs_refcount_find_shared( } done: - trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, *fbno, *flen); + trace_xfs_refcount_find_shared_result(cur, *fbno, *flen); out_error: if (error) - trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_); return error; } @@ -1716,7 +1826,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); ext.rc_blockcount = 0; ext.rc_refcount = 0; ext.rc_domain = XFS_REFC_DOMAIN_COW; @@ -1737,8 +1847,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_refcount = 1; tmp.rc_domain = XFS_REFC_DOMAIN_COW; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &tmp); + trace_xfs_refcount_modify_extent(cur, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1769,8 +1878,7 @@ xfs_refcount_adjust_cow_extents( } ext.rc_refcount = 0; - trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.pag->pag_agno, &ext); + trace_xfs_refcount_modify_extent(cur, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1786,8 +1894,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: - trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_); return error; } @@ -1833,8 +1940,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_); return error; } @@ -1847,8 +1953,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_increase(rcur, agbno, aglen); /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, @@ -1864,8 +1969,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_cow_decrease(rcur, agbno, aglen); /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, @@ -1876,6 +1980,7 @@ __xfs_refcount_cow_free( void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1884,17 +1989,17 @@ xfs_refcount_alloc_cow_extent( if (!xfs_has_reflink(mp)) return; - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); /* Add rmap entry */ - xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); } /* Forget a CoW staging event in the refcount btree. */ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, + bool isrt, xfs_fsblock_t fsb, xfs_extlen_t len) { @@ -1904,9 +2009,8 @@ xfs_refcount_free_cow_extent( return; /* Remove rmap entry */ - xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); } struct xfs_refcount_recovery { @@ -1935,7 +2039,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { xfs_btree_mark_sick(cur); @@ -1950,12 +2054,13 @@ xfs_refcount_recover_extent( /* Find and remove leftover CoW reservations. */ int xfs_refcount_recover_cow_leftovers( - struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_group *xg) { + struct xfs_mount *mp = xg->xg_mount; + bool isrt = xg->xg_type == XG_TYPE_RTG; struct xfs_trans *tp; struct xfs_btree_cur *cur; - struct xfs_buf *agbp; + struct xfs_buf *agbp = NULL; struct xfs_refcount_recovery *rr, *n; struct list_head debris; union xfs_btree_irec low = { @@ -1968,10 +2073,19 @@ xfs_refcount_recover_cow_leftovers( xfs_fsblock_t fsb; int error; - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) - return -EOPNOTSUPP; + + if (isrt) { + if (!xfs_has_rtgroups(mp)) + return 0; + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) + return -EOPNOTSUPP; + } else { + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) + return -EOPNOTSUPP; + } INIT_LIST_HEAD(&debris); @@ -1989,16 +2103,24 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); - if (error) - goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + if (isrt) { + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); + } else { + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); + } /* Find all the leftover CoW staging extents. */ error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); - xfs_trans_brelse(tp, agbp); + if (agbp) + xfs_trans_brelse(tp, agbp); + else + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); xfs_trans_cancel(tp); if (error) goto out_free; @@ -2010,19 +2132,16 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, pag->pag_agno, - &rr->rr_rrec); - /* Free the orphan record */ - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, - rr->rr_rrec.rc_startblock); - xfs_refcount_free_cow_extent(tp, fsb, + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); + xfs_refcount_free_cow_extent(tp, isrt, fsb, rr->rr_rrec.rc_blockcount); /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE, false); + XFS_AG_RESV_NONE, + isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) goto out_trans; @@ -2087,7 +2206,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_refcount_check_btrec(cur, &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9b56768a590c..f2e299a716a4 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -12,6 +12,7 @@ struct xfs_perag; struct xfs_btree_cur; struct xfs_bmbt_irec; struct xfs_refcount_irec; +struct xfs_rtgroup; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); @@ -48,12 +49,19 @@ enum xfs_refcount_intent_type { XFS_REFCOUNT_FREE_COW, }; +#define XFS_REFCOUNT_INTENT_STRINGS \ + { XFS_REFCOUNT_INCREASE, "incr" }, \ + { XFS_REFCOUNT_DECREASE, "decr" }, \ + { XFS_REFCOUNT_ALLOC_COW, "alloc_cow" }, \ + { XFS_REFCOUNT_FREE_COW, "free_cow" } + struct xfs_refcount_intent { struct list_head ri_list; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; + bool ri_realtime; }; /* Check that the refcount is appropriate for the record domain. */ @@ -68,29 +76,25 @@ xfs_refcount_check_domain( return true; } -void xfs_refcount_update_get_group(struct xfs_mount *mp, - struct xfs_refcount_intent *ri); - -void xfs_refcount_increase_extent(struct xfs_trans *tp, +void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -void xfs_refcount_decrease_extent(struct xfs_trans *tp, +void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, struct xfs_bmbt_irec *irec); -extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); -extern int xfs_refcount_finish_one(struct xfs_trans *tp, +int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); +int xfs_rtrefcount_finish_one(struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared); -void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, - xfs_extlen_t len); -extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - struct xfs_perag *pag); +void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, + xfs_fsblock_t fsb, xfs_extlen_t len); +int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); /* * While we're adjusting the refcounts records of an extent, we have @@ -119,6 +123,8 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index ca59f6c89f3e..54505fee1852 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -30,7 +30,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -68,21 +68,20 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, - xfs_refc_block(args.mp))); + xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp))); if (error) goto out_error; if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.pag->pag_agno); + ASSERT(args.agno == cur->bc_group->xg_gno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -109,7 +108,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0); } STATIC int @@ -170,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -362,11 +361,11 @@ xfs_refcountbt_init_cursor( { struct xfs_btree_cur *cur; - ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); + ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; cur->bc_ag.agbp = agbp; @@ -417,9 +416,10 @@ xfs_refcountbt_block_maxrecs( /* * Calculate the number of records in a refcount btree block. */ -int +unsigned int xfs_refcountbt_maxrecs( - int blocklen, + struct xfs_mount *mp, + unsigned int blocklen, bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; @@ -514,7 +514,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 1e0ab25f6c68..beb93bef6a81 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -48,7 +48,8 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag); -extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); +unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ef16f6f9cef6..3cdf50563fec 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -24,6 +24,9 @@ #include "xfs_inode.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rmap_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -100,8 +103,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, - irec->rm_startblock, irec->rm_blockcount, + trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); @@ -111,8 +113,7 @@ xfs_rmap_update( xfs_rmap_irec_offset_pack(irec)); error = xfs_btree_update(cur, &rec); if (error) - trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_update_error(cur, error, _RET_IP_); return error; } @@ -128,8 +129,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) @@ -155,8 +155,7 @@ xfs_rmap_insert( } done: if (error) - trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_insert_error(rcur, error, _RET_IP_); return error; } @@ -172,8 +171,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, - len, owner, offset, flags); + trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) @@ -194,8 +192,7 @@ xfs_rmap_delete( } done: if (error) - trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_delete_error(rcur, error, _RET_IP_); return error; } @@ -218,7 +215,7 @@ xfs_rmap_check_irec( struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); bool is_inode; bool is_unwritten; bool is_bmbt; @@ -269,14 +266,78 @@ xfs_rmap_check_irec( return NULL; } +static xfs_failaddr_t +xfs_rtrmap_check_meta_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (irec->rm_offset != 0) + return __this_address; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + return __this_address; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + if (irec->rm_startblock != 0) + return __this_address; + if (irec->rm_blockcount != mp->m_sb.sb_rextsize) + return __this_address; + return NULL; + case XFS_RMAP_OWN_COW: + if (!xfs_has_rtreflink(mp)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; + return NULL; + default: + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_rtrmap_check_inode_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (!xfs_verify_ino(mp, irec->rm_owner)) + return __this_address; + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount)) + return __this_address; + if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + return NULL; +} + +xfs_failaddr_t +xfs_rtrmap_check_irec( + struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec) +{ + if (irec->rm_blockcount == 0) + return __this_address; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK)) + return __this_address; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return xfs_rtrmap_check_meta_irec(rtg, irec); + return xfs_rtrmap_check_inode_irec(rtg, irec); +} + static inline xfs_failaddr_t xfs_rmap_check_btrec( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *irec) { - if (xfs_btree_is_mem_rmap(cur->bc_ops)) - return xfs_rmap_check_irec(cur->bc_mem.pag, irec); - return xfs_rmap_check_irec(cur->bc_ag.pag, irec); + if (xfs_btree_is_rtrmap(cur->bc_ops) || + xfs_btree_is_mem_rtrmap(cur->bc_ops)) + return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); + return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); } static inline int @@ -290,10 +351,14 @@ xfs_rmap_complain_bad_rec( if (xfs_btree_is_mem_rmap(cur->bc_ops)) xfs_warn(mp, "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else if (xfs_btree_is_rtrmap(cur->bc_ops)) + xfs_warn(mp, + "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!", + cur->bc_group->xg_gno, fa); else xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -342,8 +407,7 @@ xfs_rmap_find_left_neighbor_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -393,8 +457,8 @@ xfs_rmap_find_left_neighbor( info.high.rm_blockcount = 0; info.irec = irec; - trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); + trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset, + flags); /* * Historically, we always used the range query to walk every reverse @@ -425,8 +489,7 @@ xfs_rmap_find_left_neighbor( return error; *stat = 1; - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -441,8 +504,7 @@ xfs_rmap_lookup_le_range_helper( { struct xfs_find_left_neighbor_info *info = priv; - trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock, + trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -489,8 +551,7 @@ xfs_rmap_lookup_le_range( *stat = 0; info.irec = irec; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, - bno, 0, owner, offset, flags); + trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags); /* * Historically, we always used the range query to walk every reverse @@ -521,8 +582,7 @@ xfs_rmap_lookup_le_range( return error; *stat = 1; - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return 0; @@ -537,7 +597,7 @@ xfs_rmap_free_check_owner( struct xfs_btree_cur *cur, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_filblks_t len, + xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags) @@ -634,8 +694,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -651,10 +710,9 @@ xfs_rmap_unmap( goto out_error; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, - ltrec.rm_blockcount, ltrec.rm_owner, - ltrec.rm_offset, ltrec.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, + ltrec.rm_flags); ltoff = ltrec.rm_offset; /* @@ -721,10 +779,9 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - ltrec.rm_startblock, ltrec.rm_blockcount, - ltrec.rm_owner, ltrec.rm_offset, - ltrec.rm_flags); + trace_xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; @@ -800,8 +857,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - cur->bc_rec.r.rm_startblock, + trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, cur->bc_rec.r.rm_offset, @@ -812,12 +868,10 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } @@ -851,7 +905,7 @@ xfs_rmap_hook_enable(void) static inline void xfs_rmap_update_hook( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, enum xfs_rmap_intent_type op, xfs_agblock_t startblock, xfs_extlen_t blockcount, @@ -866,27 +920,27 @@ xfs_rmap_update_hook( .oinfo = *oinfo, /* struct copy */ }; - if (pag) - xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p); + if (xg) + xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p); } } /* Call the specified function during a reverse mapping update. */ int xfs_rmap_hook_add( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Stop calling the specified function during a reverse mapping update. */ void xfs_rmap_hook_del( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Configure rmap update hook functions. */ @@ -921,7 +975,8 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len, + false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -987,8 +1042,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); /* @@ -1001,8 +1055,7 @@ xfs_rmap_map( if (error) goto out_error; if (have_lt) { - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, + trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -1040,10 +1093,10 @@ xfs_rmap_map( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; } @@ -1080,12 +1133,9 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - gtrec.rm_startblock, - gtrec.rm_blockcount, - gtrec.rm_owner, - gtrec.rm_offset, - gtrec.rm_flags); + trace_xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; @@ -1132,8 +1182,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, flags); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) goto out_error; @@ -1144,12 +1193,10 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno, - error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } @@ -1173,7 +1220,8 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false, + oinfo); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1223,8 +1271,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for an exact match or the left-adjacent @@ -1240,10 +1287,9 @@ xfs_rmap_convert( goto done; } - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, PREV.rm_startblock, - PREV.rm_blockcount, PREV.rm_owner, - PREV.rm_offset, PREV.rm_flags); + trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); ASSERT(PREV.rm_offset <= offset); ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); @@ -1284,10 +1330,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, - LEFT.rm_blockcount, LEFT.rm_owner, - LEFT.rm_offset, LEFT.rm_flags); + trace_xfs_rmap_find_left_neighbor_result(cur, + LEFT.rm_startblock, LEFT.rm_blockcount, + LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && LEFT.rm_offset + LEFT.rm_blockcount == offset && xfs_rmap_is_mergeable(&LEFT, owner, newext)) @@ -1325,10 +1370,10 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && offset + len == RIGHT.rm_offset && xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1344,8 +1389,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* reset the cursor back to PREV */ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); @@ -1376,10 +1420,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1396,10 +1439,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1428,10 +1470,9 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - PREV.rm_startblock, PREV.rm_blockcount, - PREV.rm_owner, PREV.rm_offset, - PREV.rm_flags); + trace_xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1468,10 +1509,9 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, - RIGHT.rm_startblock, RIGHT.rm_blockcount, - RIGHT.rm_owner, RIGHT.rm_offset, - RIGHT.rm_flags); + trace_xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; @@ -1549,8 +1589,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1608,8 +1647,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, - len, owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1640,9 +1678,8 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, - NEW.rm_startblock, NEW.rm_blockcount, - NEW.rm_owner, NEW.rm_offset, + trace_xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); error = xfs_btree_insert(cur, &i); if (error) @@ -1669,8 +1706,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, - owner, offset, newext); + trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -1694,12 +1730,10 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -1735,8 +1769,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo); /* * For the initial lookup, look for and exact match or the left-adjacent @@ -1805,10 +1838,10 @@ xfs_rmap_convert_shared( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, - RIGHT.rm_blockcount, RIGHT.rm_owner, - RIGHT.rm_offset, RIGHT.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) state |= RMAP_RIGHT_CONTIG; } @@ -1822,8 +1855,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, - _RET_IP_); + trace_xfs_rmap_convert_state(cur, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. */ @@ -2121,12 +2153,10 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo); done: if (error) - trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_convert_error(cur, error, _RET_IP_); return error; } @@ -2164,8 +2194,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo); /* * We should always have a left record because there's a static record @@ -2321,12 +2350,10 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_unmap_error(cur, error, _RET_IP_); return error; } @@ -2361,8 +2388,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, @@ -2387,10 +2413,10 @@ xfs_rmap_map_shared( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, - gtrec.rm_blockcount, gtrec.rm_owner, - gtrec.rm_offset, gtrec.rm_flags); + trace_xfs_rmap_find_right_neighbor_result(cur, + gtrec.rm_startblock, gtrec.rm_blockcount, + gtrec.rm_owner, gtrec.rm_offset, + gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) have_gt = 0; @@ -2482,12 +2508,10 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, - unwritten, oinfo); + trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_ag.pag->pag_agno, error, _RET_IP_); + trace_xfs_rmap_map_error(cur, error, _RET_IP_); return error; } @@ -2572,23 +2596,6 @@ xfs_rmap_query_all( return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); } -/* Clean up after calling xfs_rmap_finish_one. */ -void -xfs_rmap_finish_one_cleanup( - struct xfs_trans *tp, - struct xfs_btree_cur *rcur, - int error) -{ - struct xfs_buf *agbp; - - if (rcur == NULL) - return; - agbp = rcur->bc_ag.agbp; - xfs_btree_del_cursor(rcur, error); - if (error) - xfs_trans_brelse(tp, agbp); -} - /* Commit an rmap operation into the ondisk tree. */ int __xfs_rmap_finish_intent( @@ -2621,6 +2628,47 @@ __xfs_rmap_finish_intent( } } +static int +xfs_rmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_perag *pag = to_perag(ri->ri_group); + struct xfs_buf *agbp = NULL; + int error; + + /* + * Refresh the freelist before we start changing the rmapbt, because a + * shape change could cause us to allocate blocks. + */ + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return error; + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); + return -EFSCORRUPTED; + } + *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag); + return 0; +} + +static int +xfs_rtrmap_finish_init_cursor( + struct xfs_trans *tp, + struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) +{ + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + *pcur = xfs_rtrmapbt_init_cursor(tp, rtg); + return 0; +} + /* * Process one of the deferred rmap operations. We pass back the * btree cursor to maintain our lock on the rmapbt between calls. @@ -2634,20 +2682,13 @@ xfs_rmap_finish_one( struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *rcur; - struct xfs_buf *agbp = NULL; - int error = 0; struct xfs_owner_info oinfo; + struct xfs_mount *mp = tp->t_mountp; xfs_agblock_t bno; bool unwritten; + int error = 0; - bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - - trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, - ri->ri_owner, ri->ri_whichfork, - ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, - ri->ri_bmap.br_state); + trace_xfs_rmap_deferred(mp, ri); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; @@ -2656,43 +2697,31 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - xfs_rmap_finish_one_cleanup(tp, rcur, 0); - rcur = NULL; + if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) { + xfs_btree_del_cursor(*pcur, 0); *pcur = NULL; } - if (rcur == NULL) { - /* - * Refresh the freelist before we start changing the - * rmapbt, because a shape change could cause us to - * allocate blocks. - */ - error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); - if (error) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); + if (*pcur == NULL) { + if (ri->ri_group->xg_type == XG_TYPE_RTG) + error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur); + else + error = xfs_rmap_finish_init_cursor(tp, ri, pcur); + if (error) return error; - } - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); - return -EFSCORRUPTED; - } - - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } - *pcur = rcur; xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff); unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, + bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, + ri->ri_group->xg_type); + error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno, ri->ri_bmap.br_blockcount, &oinfo, unwritten); if (error) return error; - xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno, + xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno, ri->ri_bmap.br_blockcount, unwritten, &oinfo); return 0; } @@ -2717,29 +2746,21 @@ __xfs_rmap_add( struct xfs_trans *tp, enum xfs_rmap_intent_type type, uint64_t owner, + bool isrt, int whichfork, struct xfs_bmbt_irec *bmap) { struct xfs_rmap_intent *ri; - trace_xfs_rmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - owner, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + ri->ri_realtime = isrt; - xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); + xfs_rmap_defer_add(tp, ri); } /* Map an extent into a file. */ @@ -2751,6 +2772,7 @@ xfs_rmap_map_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2758,7 +2780,7 @@ xfs_rmap_map_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_MAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2770,6 +2792,7 @@ xfs_rmap_unmap_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; @@ -2777,7 +2800,7 @@ xfs_rmap_unmap_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_UNMAP_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* @@ -2795,6 +2818,7 @@ xfs_rmap_convert_extent( struct xfs_bmbt_irec *PREV) { enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (!xfs_rmap_update_is_needed(mp, whichfork)) return; @@ -2802,15 +2826,15 @@ xfs_rmap_convert_extent( if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) type = XFS_RMAP_CONVERT_SHARED; - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ void xfs_rmap_alloc_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2819,20 +2843,20 @@ xfs_rmap_alloc_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap); } /* Schedule the deletion of an rmap for non-file data. */ void xfs_rmap_free_extent( struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, + bool isrt, + xfs_fsblock_t fsbno, xfs_extlen_t len, uint64_t owner) { @@ -2841,12 +2865,12 @@ xfs_rmap_free_extent( if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) return; - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); + bmap.br_startblock = fsbno; bmap.br_blockcount = len; bmap.br_startoff = 0; bmap.br_state = XFS_EXT_NORM; - __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap); } /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 9d01fe689497..5f39f6e53cd1 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,6 +7,7 @@ #define __XFS_RMAP_H__ struct xfs_perag; +struct xfs_rtgroup; static inline void xfs_rmap_ino_bmbt_owner( @@ -157,18 +158,26 @@ enum xfs_rmap_intent_type { XFS_RMAP_FREE, }; +#define XFS_RMAP_INTENT_STRINGS \ + { XFS_RMAP_MAP, "map" }, \ + { XFS_RMAP_MAP_SHARED, "map_shared" }, \ + { XFS_RMAP_UNMAP, "unmap" }, \ + { XFS_RMAP_UNMAP_SHARED, "unmap_shared" }, \ + { XFS_RMAP_CONVERT, "cvt" }, \ + { XFS_RMAP_CONVERT_SHARED, "cvt_shared" }, \ + { XFS_RMAP_ALLOC, "alloc" }, \ + { XFS_RMAP_FREE, "free" } + struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; + bool ri_realtime; }; -void xfs_rmap_update_get_group(struct xfs_mount *mp, - struct xfs_rmap_intent *ri); - /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); @@ -177,13 +186,11 @@ void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); -void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); -void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); +void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); +void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, + xfs_extlen_t len, uint64_t owner); -void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur, @@ -201,6 +208,8 @@ xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, const struct xfs_rmap_irec *irec); +xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg, + const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome); @@ -259,8 +268,8 @@ struct xfs_rmap_hook { void xfs_rmap_hook_disable(void); void xfs_rmap_hook_enable(void); -int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook); -void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook); void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); #endif diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9e759efa81cc..2cab694ac58a 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -57,7 +57,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -66,14 +66,15 @@ xfs_rmapbt_set_root( const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); ASSERT(ptr->s != 0); agf->agf_rmap_root = ptr->s; be32_add_cpu(&agf->agf_rmap_level, inc); - cur->bc_ag.pag->pagf_rmap_level += inc; + pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -87,7 +88,8 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_alloc_arg args = { .len = 1 }; int error; xfs_agblock_t bno; @@ -101,13 +103,17 @@ xfs_rmapbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); + xfs_extent_busy_reuse(pag_group(pag), bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); + /* + * Since rmapbt blocks are sourced from the AGFL, they are allocated one + * at a time and the reservation updates don't require a transaction. + */ + xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); *stat = 1; return 0; @@ -120,7 +126,7 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t bno; int error; @@ -131,7 +137,7 @@ xfs_rmapbt_free_block( if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); @@ -222,7 +228,7 @@ xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_rmap_root; } @@ -533,7 +539,7 @@ xfs_rmapbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -642,14 +648,13 @@ xfs_rmapbt_mem_cursor( struct xfbtree *xfbt) { struct xfs_btree_cur *cur; - struct xfs_mount *mp = pag->pag_mount; - cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops, + cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops, xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); cur->bc_mem.xfbtree = xfbt; cur->bc_nlevels = xfbt->nlevels; - cur->bc_mem.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); return cur; } @@ -726,10 +731,11 @@ xfs_rmapbt_block_maxrecs( /* * Calculate number of records in an rmap btree block. */ -int +unsigned int xfs_rmapbt_maxrecs( - int blocklen, - int leaf) + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) { blocklen -= XFS_RMAP_BLOCK_LEN; return xfs_rmapbt_block_maxrecs(blocklen, leaf); @@ -857,7 +863,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index eb90d89e8086..119b1567cd0e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -47,7 +47,8 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); -int xfs_rmapbt_maxrecs(int blocklen, int leaf); +unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f246d6dbf4ec..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -13,33 +13,94 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_rtbitmap.h" #include "xfs_health.h" +#include "xfs_sb.h" +#include "xfs_errortag.h" +#include "xfs_log.h" +#include "xfs_buf_item.h" +#include "xfs_extent_busy.h" /* * Realtime allocator bitmap functions shared with userspace. */ -/* - * Real time buffers need verifiers to avoid runtime warnings during IO. - * We don't have anything to verify, however, so these are just dummy - * operations. - */ +static xfs_failaddr_t +xfs_rtbuf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (!xfs_verify_magic(bp, hdr->rt_magic)) + return __this_address; + if (!xfs_has_rtgroups(mp)) + return __this_address; + if (!xfs_has_crc(mp)) + return __this_address; + if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp))) + return __this_address; + return NULL; +} + static void xfs_rtbuf_verify_read( - struct xfs_buf *bp) + struct xfs_buf *bp) { + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) { + fa = __this_address; + goto fail; + } + + if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) { + fa = __this_address; + goto fail; + } + + fa = xfs_rtbuf_verify(bp); + if (fa) + goto fail; + return; +fail: + xfs_verifier_error(bp, -EFSCORRUPTED, fa); } static void xfs_rtbuf_verify_write( struct xfs_buf *bp) { - return; + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + fa = xfs_rtbuf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + if (bip) + hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF); } const struct xfs_buf_ops xfs_rtbuf_ops = { @@ -48,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +const struct xfs_buf_ops xfs_rtbitmap_buf_ops = { + .name = "xfs_rtbitmap", + .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + +const struct xfs_buf_ops xfs_rtsummary_buf_ops = { + .name = "xfs_rtsummary", + .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + /* Release cached rt bitmap and summary buffers. */ void xfs_rtbuf_cache_relse( @@ -69,32 +146,35 @@ xfs_rtbuf_cache_relse( * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( struct xfs_rtalloc_args *args, xfs_fileoff_t block, /* block number in bitmap or summary */ - int issum) /* is summary not bitmap */ + enum xfs_rtg_inodes type) { + struct xfs_inode *ip = args->rtg->rtg_inodes[type]; struct xfs_mount *mp = args->mp; struct xfs_buf **cbpp; /* cached block buffer */ xfs_fileoff_t *coffp; /* cached block number */ struct xfs_buf *bp; /* block buffer, result */ - struct xfs_inode *ip; /* bitmap or summary inode */ struct xfs_bmbt_irec map; - enum xfs_blft type; + enum xfs_blft buf_type; int nmap = 1; int error; - if (issum) { + switch (type) { + case XFS_RTGI_SUMMARY: cbpp = &args->sumbp; coffp = &args->sumoff; - ip = mp->m_rsumip; - type = XFS_BLFT_RTSUMMARY_BUF; - } else { + buf_type = XFS_BLFT_RTSUMMARY_BUF; + break; + case XFS_RTGI_BITMAP: cbpp = &args->rbmbp; coffp = &args->rbmoff; - ip = mp->m_rbmip; - type = XFS_BLFT_RTBITMAP_BUF; + buf_type = XFS_BLFT_RTBITMAP_BUF; + break; + default: + return -EINVAL; } /* @@ -117,36 +197,74 @@ xfs_rtbuf_get( return error; if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { - xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); return -EFSCORRUPTED; } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + mp->m_bsize, 0, &bp, + xfs_rtblock_ops(mp, type)); if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); if (error) return error; - xfs_trans_buf_set_type(args->tp, bp, type); + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) { + xfs_buf_mark_corrupt(bp); + xfs_trans_brelse(args->tp, bp); + xfs_rtginode_mark_sick(args->rtg, type); + return -EFSCORRUPTED; + } + } + + xfs_trans_buf_set_type(args->tp, bp, buf_type); *cbpp = bp; *coffp = block; return 0; } +int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) { + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP); + return -EFSCORRUPTED; + } + + return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP); +} + +int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) { + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY); + return -EFSCORRUPTED; + } + return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY); +} + /* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. + * Searching backward from start find the first block whose allocated/free state + * is different from start's. */ int xfs_rtfind_back( struct xfs_rtalloc_args *args, xfs_rtxnum_t start, /* starting rtext to look at */ - xfs_rtxnum_t limit, /* last rtext to look at */ xfs_rtxnum_t *rtx) /* out: start rtext found */ { struct xfs_mount *mp = args->mp; @@ -175,7 +293,7 @@ xfs_rtfind_back( */ word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; + len = start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. @@ -316,6 +434,8 @@ xfs_rtfind_forw( xfs_rtword_t incore; unsigned int word; /* word number in the buffer */ + ASSERT(start <= limit); + /* * Compute and read in starting bitmap block for starting block. */ @@ -471,6 +591,7 @@ xfs_rtmodify_summary( { struct xfs_mount *mp = args->mp; xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; unsigned int infoword; xfs_suminfo_t val; int error; @@ -482,11 +603,11 @@ xfs_rtmodify_summary( infoword = xfs_rtsumoffs_to_infoword(mp, so); val = xfs_suminfo_add(args, infoword, delta); - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache) { + if (val == 0 && log + 1 == rsum_cache[bbno]) + rsum_cache[bbno] = log; + if (val != 0 && log >= rsum_cache[bbno]) + rsum_cache[bbno] = log + 1; } xfs_trans_log_rtsummary(args, infoword); @@ -698,14 +819,14 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(args, start, 0, &preblock); + error = xfs_rtfind_back(args, start, &preblock); if (error) { return error; } /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -929,19 +1050,25 @@ xfs_rtcheck_alloc_range( int xfs_rtfree_extent( struct xfs_trans *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, xfs_rtxnum_t start, /* starting rtext number to free */ xfs_rtxlen_t len) /* length of extent freed */ { struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rbmip = rtg_bitmap(rtg); struct xfs_rtalloc_args args = { .mp = mp, .tp = tp, + .rtg = rtg, }; int error; struct timespec64 atime; - ASSERT(mp->m_rbmip->i_itemp != NULL); - xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + ASSERT(rbmip->i_itemp != NULL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); if (error) @@ -958,19 +1085,21 @@ xfs_rtfree_extent( * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* * If we've now freed all the blocks, reset the file sequence - * number to 0. + * number to 0 for pre-RTG file systems. */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + if (!xfs_has_rtgroups(mp) && + tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - atime = inode_get_atime(VFS_I(mp->m_rbmip)); + atime = inode_get_atime(VFS_I(rbmip)); atime.tv_sec = 0; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), atime); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); } error = 0; out: @@ -986,84 +1115,96 @@ out: int xfs_rtfree_blocks( struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { struct xfs_mount *mp = tp->t_mountp; - xfs_rtxnum_t start; - xfs_filblks_t len; xfs_extlen_t mod; + int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); - len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + mod = xfs_blen_to_rtxoff(mp, rtlen); if (mod) { ASSERT(mod == 0); return -EIO; } - start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + mod = xfs_rtb_to_rtxoff(mp, rtbno); if (mod) { ASSERT(mod == 0); return -EIO; } - return xfs_rtfree_extent(tp, start, len); + error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno), + xfs_extlen_to_rtxlen(mp, rtlen)); + if (error) + return error; + + if (xfs_has_rtgroups(mp)) + xfs_extent_busy_insert(tp, rtg_group(rtg), + xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0); + + return 0; } /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, + xfs_rtxnum_t start, + xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_rtalloc_args args = { + .rtg = rtg, .mp = mp, .tp = tp, }; - struct xfs_rtalloc_rec rec; - xfs_rtxnum_t rtstart; - xfs_rtxnum_t rtend; - xfs_rtxnum_t high_key; - int is_free; int error = 0; - if (low_rec->ar_startext > high_rec->ar_startext) + if (start > end) return -EINVAL; - if (low_rec->ar_startext >= mp->m_sb.sb_rextents || - low_rec->ar_startext == high_rec->ar_startext) + if (start == end || start >= rtg->rtg_extents) return 0; - high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1); + end = min(end, rtg->rtg_extents - 1); + + if (xfs_has_zoned(mp)) + return -EINVAL; /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startext; - while (rtstart <= high_key) { + while (start <= end) { + struct xfs_rtalloc_rec rec; + int is_free; + xfs_rtxnum_t rtend; + /* Is the first block free? */ - error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, start, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, start, end, &rtend); if (error) break; if (is_free) { - rec.ar_startext = rtstart; - rec.ar_extcount = rtend - rtstart + 1; + rec.ar_startext = start; + rec.ar_extcount = rtend - start + 1; - error = fn(mp, tp, &rec, priv); + error = fn(rtg, tp, &rec, priv); if (error) break; } - rtstart = rtend + 1; + start = rtend + 1; } xfs_rtbuf_cache_relse(&args); @@ -1073,31 +1214,27 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) { - struct xfs_rtalloc_rec keys[2]; - - keys[0].ar_startext = 0; - keys[1].ar_startext = mp->m_sb.sb_rextents - 1; - keys[0].ar_extcount = keys[1].ar_extcount = 0; - - return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn, + priv); } /* Is the given extent all free? */ int xfs_rtalloc_extent_is_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free) { struct xfs_rtalloc_args args = { - .mp = mp, + .mp = rtg_mount(rtg), + .rtg = rtg, .tp = tp, }; xfs_rtxnum_t end; @@ -1113,58 +1250,255 @@ xfs_rtalloc_extent_is_free( return 0; } +/* Compute the number of rt extents tracked by a single bitmap block. */ +xfs_rtxnum_t +xfs_rtbitmap_rtx_per_rbmblock( + struct xfs_mount *mp) +{ + unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize; + + if (xfs_has_rtgroups(mp)) + rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo); + + return rbmblock_bytes * NBBY; +} + /* * Compute the number of rtbitmap blocks needed to track the given number of rt * extents. */ xfs_filblks_t -xfs_rtbitmap_blockcount( +xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { - return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); + if (xfs_has_zoned(mp)) + return 0; + return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); +} + +/* How many rt extents does each rtbitmap file track? */ +static inline xfs_rtbxlen_t +xfs_rtbitmap_bitcount( + struct xfs_mount *mp) +{ + if (!mp->m_sb.sb_rextents) + return 0; + + /* rtgroup size can be nonzero even if rextents is zero */ + if (xfs_has_rtgroups(mp)) + return mp->m_sb.sb_rgextents; + + return mp->m_sb.sb_rextents; } /* - * Compute the number of rtbitmap words needed to populate every block of a - * bitmap that is large enough to track the given number of rt extents. + * Compute the number of rtbitmap blocks used for a given file system. */ -unsigned long long -xfs_rtbitmap_wordcount( - struct xfs_mount *mp, - xfs_rtbxlen_t rtextents) +xfs_filblks_t +xfs_rtbitmap_blockcount( + struct xfs_mount *mp) { - xfs_filblks_t blocks; - - blocks = xfs_rtbitmap_blockcount(mp, rtextents); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; + return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp)); } -/* Compute the number of rtsummary blocks needed to track the given rt space. */ +/* + * Compute the geometry of the rtsummary file needed to track the given rt + * space. + */ xfs_filblks_t xfs_rtsummary_blockcount( struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) + unsigned int *rsumlevels) { + xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; - rsumwords = (unsigned long long)rsumlevels * rbmblocks; - return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + + *rsumlevels = xfs_compute_rextslog(rextents) + 1; + rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); + return howmany_64(rsumwords, mp->m_blockwsize); +} + +static int +xfs_rtfile_alloc_blocks( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_bmbt_irec *map) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nmap = 1; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, + XFS_GROWFSRT_SPACE_RES(mp, count_fsb), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_METADATA, 0, map, &nmap); + if (error) + goto out_trans_cancel; + + return xfs_trans_commit(tp); + +out_trans_cancel: + xfs_trans_cancel(tp); + return error; +} + +/* Get a buffer for the block. */ +static int +xfs_rtfile_initialize_block( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + xfs_fsblock_t fsbno, + void *data) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *ip = rtg->rtg_inodes[type]; + struct xfs_trans *tp; + struct xfs_buf *bp; + void *bufdata; + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + enum xfs_blft buf_type; + int error; + + if (type == XFS_RTGI_BITMAP) + buf_type = XFS_BLFT_RTBITMAP_BUF; + else if (type == XFS_RTGI_SUMMARY) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + return -EINVAL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + bufdata = bp->b_addr; + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = xfs_rtblock_ops(mp, type); + + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (type == XFS_RTGI_BITMAP) + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + else + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(ip->i_ino); + hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid); + + bufdata += sizeof(*hdr); + } + + if (data) + memcpy(bufdata, data, copylen); + else + memset(bufdata, 0, copylen); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + return xfs_trans_commit(tp); } /* - * Compute the number of rtsummary info words needed to populate every block of - * a summary file that is large enough to track the given rt space. + * Allocate space to the bitmap or summary file, and zero it, for growfs. + * @data must be a contiguous buffer large enough to fill all blocks in the + * file; or NULL to initialize the contents to zeroes. */ -unsigned long long -xfs_rtsummary_wordcount( - struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) +int +xfs_rtfile_initialize_blocks( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + xfs_fileoff_t offset_fsb, /* offset to start from */ + xfs_fileoff_t end_fsb, /* offset to allocate to */ + void *data) /* data to fill the blocks */ { - xfs_filblks_t blocks; + struct xfs_mount *mp = rtg_mount(rtg); + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + + while (offset_fsb < end_fsb) { + struct xfs_bmbt_irec map; + xfs_filblks_t i; + int error; + + error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type], + offset_fsb, end_fsb - offset_fsb, &map); + if (error) + return error; + + /* + * Now we need to clear the allocated blocks. + * + * Do this one block per transaction, to keep it simple. + */ + for (i = 0; i < map.br_blockcount; i++) { + error = xfs_rtfile_initialize_block(rtg, type, + map.br_startblock + i, data); + if (error) + return error; + if (data) + data += copylen; + } - blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; + offset_fsb = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +int +xfs_rtbitmap_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize; + if (init && !xfs_has_rtgroups(mp)) { + ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + inode_set_atime(VFS_I(ip), 0, 0); + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_rtsummary_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; } diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 152a66750af5..22e5d9cd95f4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -6,7 +6,10 @@ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ +#include "xfs_rtgroup.h" + struct xfs_rtalloc_args { + struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; @@ -19,13 +22,37 @@ struct xfs_rtalloc_args { static inline xfs_rtblock_t xfs_rtx_to_rtb( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); + if (mp->m_rtxblklog >= 0) - return rtx << mp->m_rtxblklog; + return start + (rtx << mp->m_rtxblklog); + return start + (rtx * mp->m_sb.sb_rextsize); +} - return rtx * mp->m_sb.sb_rextsize; +/* Convert an rgbno into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rgbno_to_rtx( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rgbno >> mp->m_rtxblklog; + return rgbno / mp->m_sb.sb_rextsize; +} + +static inline uint64_t +xfs_rtbxlen_to_blen( + struct xfs_mount *mp, + xfs_rtbxlen_t rtbxlen) +{ + if (mp->m_rtxblklog >= 0) + return rtbxlen << mp->m_rtxblklog; + + return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t @@ -62,84 +89,90 @@ xfs_extlen_to_rtxlen( return len / mp->m_sb.sb_rextsize; } -/* Convert an rt block number into an rt extent number. */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtx( +/* Convert an rt block count into an rt extent count. */ +static inline xfs_rtbxlen_t +xfs_blen_to_rtbxlen( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + uint64_t blen) { if (likely(mp->m_rtxblklog >= 0)) - return rtbno >> mp->m_rtxblklog; + return blen >> mp->m_rtxblklog; - return div_u64(rtbno, mp->m_sb.sb_rextsize); + return div_u64(blen, mp->m_sb.sb_rextsize); } -/* Return the offset of an rt block number within an rt extent. */ +/* Return the offset of a file block length within an rt extent. */ static inline xfs_extlen_t -xfs_rtb_to_rtxoff( +xfs_blen_to_rtxoff( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_filblks_t blen) { if (likely(mp->m_rtxblklog >= 0)) - return rtbno & mp->m_rtxblkmask; + return blen & mp->m_rtxblkmask; - return do_div(rtbno, mp->m_sb.sb_rextsize); + return do_div(blen, mp->m_sb.sb_rextsize); } -/* - * Crack an rt block number into an rt extent number and an offset within that - * rt extent. Returns the rt extent number directly and the offset in @off. - */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtxrem( +/* Round this block count up to the nearest rt extent size. */ +static inline xfs_filblks_t +xfs_blen_roundup_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno, - xfs_extlen_t *off) + xfs_filblks_t blen) { - if (likely(mp->m_rtxblklog >= 0)) { - *off = rtbno & mp->m_rtxblkmask; - return rtbno >> mp->m_rtxblklog; - } - - return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); + return roundup_64(blen, mp->m_sb.sb_rextsize); } -/* - * Convert an rt block number into an rt extent number, rounding up to the next - * rt extent if the rt block is not aligned to an rt extent boundary. - */ +/* Convert an rt block number into an rt extent number. */ static inline xfs_rtxnum_t -xfs_rtb_to_rtxup( +xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { - if (likely(mp->m_rtxblklog >= 0)) { - if (rtbno & mp->m_rtxblkmask) - return (rtbno >> mp->m_rtxblklog) + 1; + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; + if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; - } + return div_u64(rtbno, mp->m_sb.sb_rextsize); +} - if (do_div(rtbno, mp->m_sb.sb_rextsize)) - rtbno++; - return rtbno; +/* Return the offset of a rtgroup block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rgbno_to_rtxoff( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + return rgbno % mp->m_sb.sb_rextsize; } -/* Round this rtblock up to the nearest rt extent size. */ -static inline xfs_rtblock_t -xfs_rtb_roundup_rtx( +/* Return the offset of an rt block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { - return roundup_64(rtbno, mp->m_sb.sb_rextsize); + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; + if (likely(mp->m_rtxblklog >= 0)) + return rtbno & mp->m_rtxblkmask; + return do_div(rtbno, mp->m_sb.sb_rextsize); } -/* Round this rtblock down to the nearest rt extent size. */ +/* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_rounddown_rtx( +xfs_fileoff_roundup_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) +{ + return roundup_64(off, mp->m_sb.sb_rextsize); +} + +/* Round this file block offset down to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_fileoff_rounddown_rtx( + struct xfs_mount *mp, + xfs_fileoff_t off) { - return rounddown_64(rtbno, mp->m_sb.sb_rextsize); + return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. */ @@ -148,6 +181,9 @@ xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) + return div_u64(rtx, mp->m_rtx_per_rbmblock); + return rtx >> mp->m_blkbit_log; } @@ -157,6 +193,13 @@ xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) { + unsigned int mod; + + div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); + return mod; + } + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } @@ -166,6 +209,9 @@ xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { + if (xfs_has_rtgroups(mp)) + return rbmoff * mp->m_rtx_per_rbmblock; + return rbmoff << mp->m_blkbit_log; } @@ -175,7 +221,14 @@ xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_rtword_raw *words = args->rbmbp->b_addr; + struct xfs_mount *mp = args->mp; + union xfs_rtword_raw *words; + struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; + + if (xfs_has_rtgroups(mp)) + words = (union xfs_rtword_raw *)(hdr + 1); + else + words = args->rbmbp->b_addr; return words + index; } @@ -188,6 +241,8 @@ xfs_rtbitmap_getword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(word->rtg); return word->old; } @@ -200,7 +255,10 @@ xfs_rtbitmap_setword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); - word->old = value; + if (xfs_has_rtgroups(args->mp)) + word->rtg = cpu_to_be32(value); + else + word->old = value; } /* @@ -225,6 +283,9 @@ xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { + if (xfs_has_rtgroups(mp)) + return rsumoff / mp->m_blockwsize; + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } @@ -239,6 +300,9 @@ xfs_rtsumoffs_to_infoword( { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + if (xfs_has_rtgroups(mp)) + return rsumoff % mp->m_blockwsize; + return rsumoff & mask; } @@ -248,7 +312,13 @@ xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_suminfo_raw *info = args->sumbp->b_addr; + union xfs_suminfo_raw *info; + struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; + + if (xfs_has_rtgroups(args->mp)) + info = (union xfs_suminfo_raw *)(hdr + 1); + else + info = args->sumbp->b_addr; return info + index; } @@ -261,6 +331,8 @@ xfs_suminfo_get( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(info->rtg); return info->old; } @@ -273,10 +345,28 @@ xfs_suminfo_add( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) { + be32_add_cpu(&info->rtg, delta); + return be32_to_cpu(info->rtg); + } + info->old += delta; return info->old; } +static inline const struct xfs_buf_ops * +xfs_rtblock_ops( + struct xfs_mount *mp, + enum xfs_rtg_inodes type) +{ + if (xfs_has_rtgroups(mp)) { + if (type == XFS_RTGI_SUMMARY) + return &xfs_rtsummary_buf_ops; + return &xfs_rtbitmap_buf_ops; + } + return &xfs_rtbuf_ops; +} + /* * Functions for walking free space rtextents in the realtime bitmap. */ @@ -286,37 +376,19 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); #ifdef CONFIG_XFS_RT void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); - -int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, - int issum); - -static inline int -xfs_rtbitmap_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 0); -} - -static inline int -xfs_rtsummary_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 1); -} - +int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); +int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, - xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); + xfs_rtxnum_t *rtblock); int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -327,42 +399,43 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, +int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtxnum_t start, xfs_rtxlen_t len, - bool *is_free); -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to free */ - xfs_rtxlen_t len); /* length of extent freed */ - +int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); +int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ -int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, - xfs_filblks_t rtlen); +int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t - rtextents); -unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, +xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); - xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); -unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); + unsigned int *rsumlevels); + +int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb, void *data); +int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) -# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) + +static inline int xfs_rtfree_blocks(struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + return -ENOSYS; +} # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) @@ -370,14 +443,11 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) static inline xfs_filblks_t -xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } -# define xfs_rtbitmap_wordcount(mp, r) (0) -# define xfs_rtsummary_blockcount(mp, l, b) (0) -# define xfs_rtsummary_wordcount(mp, l, b) (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c new file mode 100644 index 000000000000..9186c58e83d5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_buf_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" + +/* Find the first usable fsblock in this rtgroup. */ +static inline uint32_t +xfs_rtgroup_min_block( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + if (xfs_has_rtsb(mp) && rgno == 0) + return mp->m_sb.sb_rextsize; + + return 0; +} + +/* Precompute this group's geometry */ +void +xfs_rtgroup_calc_geometry( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno); +} + +int +xfs_rtgroup_alloc( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL); + if (!rtg) + return -ENOMEM; + + xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents); + + error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG); + if (error) + goto out_free_rtg; + return 0; + +out_free_rtg: + kfree(rtg); + return error; +} + +void +xfs_rtgroup_free( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL); +} + +/* Free a range of incore rtgroup objects. */ +void +xfs_free_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno) +{ + xfs_rgnumber_t rgno; + + for (rgno = first_rgno; rgno < end_rgno; rgno++) + xfs_rtgroup_free(mp, rgno); +} + +/* Initialize some range of incore rtgroup objects. */ +int +xfs_initialize_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + xfs_rgnumber_t index; + int error; + + if (first_rgno >= end_rgno) + return 0; + + for (index = first_rgno; index < end_rgno; index++) { + error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents); + if (error) + goto out_unwind_new_rtgs; + } + + return 0; + +out_unwind_new_rtgs: + xfs_free_rtgroups(mp, first_rgno, index); + return error; +} + +/* Compute the number of rt extents in this realtime group. */ +xfs_rtxnum_t +__xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + ASSERT(rgno < rgcount); + if (rgno == rgcount - 1) + return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents); + + ASSERT(xfs_has_rtgroups(mp)); + return mp->m_sb.sb_rgextents; +} + +xfs_rtxnum_t +xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); +} + +/* + * Update the rt extent count of the previous tail rtgroup if it changed during + * recovery (i.e. recovery of a growfs). + */ +int +xfs_update_last_rtgroup_size( + struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount) +{ + struct xfs_rtgroup *rtg; + + ASSERT(prev_rgcount > 0); + + rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1); + if (!rtg) + return -EFSCORRUPTED; + rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1, + mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + xfs_rtgroup_rele(rtg); + return 0; +} + +/* Lock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_lock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. + */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Unlock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_unlock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } + } +} + +/* + * Join realtime group metadata inodes to the transaction. The ILOCKs will be + * released on transaction commit. + */ +void +xfs_rtgroup_trans_join( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); + + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { + xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); + } + + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) + xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); + + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); +} + +/* Retrieve rt group geometry. */ +int +xfs_rtgroup_get_geometry( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + /* Fill out form. */ + memset(rgeo, 0, sizeof(*rgeo)); + rgeo->rg_number = rtg_rgno(rtg); + rgeo->rg_length = rtg_blocks(rtg); + xfs_rtgroup_geom_health(rtg, rgeo); + return 0; +} + +#ifdef CONFIG_PROVE_LOCKING +static struct lock_class_key xfs_rtginode_lock_class; + +static int +xfs_rtginode_ilock_cmp_fn( + const struct lockdep_map *m1, + const struct lockdep_map *m2) +{ + const struct xfs_inode *ip1 = + container_of(m1, struct xfs_inode, i_lock.dep_map); + const struct xfs_inode *ip2 = + container_of(m2, struct xfs_inode, i_lock.dep_map); + + if (ip1->i_projid < ip2->i_projid) + return -1; + if (ip1->i_projid > ip2->i_projid) + return 1; + return 0; +} + +static inline void +xfs_rtginode_ilock_print_fn( + const struct lockdep_map *m) +{ + const struct xfs_inode *ip = + container_of(m, struct xfs_inode, i_lock.dep_map); + + printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid, + xfs_metafile_type_str(ip->i_metatype)); +} + +/* + * Most of the time each of the RTG inode locks are only taken one at a time. + * But when committing deferred ops, more than one of a kind can be taken. + * However, deferred rt ops will be committed in rgno order so there is no + * potential for deadlocks. The code here is needed to tell lockdep about this + * order. + */ +static inline void +xfs_rtginode_lockdep_setup( + struct xfs_inode *ip, + xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class, + type); + lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn, + xfs_rtginode_ilock_print_fn); +} +#else +#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0) +#endif /* CONFIG_PROVE_LOCKING */ + +struct xfs_rtginode_ops { + const char *name; /* short name */ + + enum xfs_metafile_type metafile_type; + + unsigned int sick; /* rtgroup sickness flag */ + + unsigned int fmt_mask; /* all valid data fork formats */ + + /* Does the fs have this feature? */ + bool (*enabled)(const struct xfs_mount *mp); + + /* Create this rtgroup metadata inode and initialize it. */ + int (*create)(struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init); +}; + +static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { + [XFS_RTGI_BITMAP] = { + .name = "bitmap", + .metafile_type = XFS_METAFILE_RTBITMAP, + .sick = XFS_SICK_RG_BITMAP, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtbitmap_create, + }, + [XFS_RTGI_SUMMARY] = { + .name = "summary", + .metafile_type = XFS_METAFILE_RTSUMMARY, + .sick = XFS_SICK_RG_SUMMARY, + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | + (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, + .create = xfs_rtsummary_create, + }, + [XFS_RTGI_RMAP] = { + .name = "rmap", + .metafile_type = XFS_METAFILE_RTRMAP, + .sick = XFS_SICK_RG_RMAPBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* + * growfs must create the rtrmap inodes before adding a + * realtime volume to the filesystem, so we cannot use the + * rtrmapbt predicate here. + */ + .enabled = xfs_has_rmapbt, + .create = xfs_rtrmapbt_create, + }, + [XFS_RTGI_REFCOUNT] = { + .name = "refcount", + .metafile_type = XFS_METAFILE_RTREFCOUNT, + .sick = XFS_SICK_RG_REFCNTBT, + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, + /* same comment about growfs and rmap inodes applies here */ + .enabled = xfs_has_reflink, + .create = xfs_rtrefcountbt_create, + }, +}; + +/* Return the shortname of this rtgroup inode. */ +const char * +xfs_rtginode_name( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].name; +} + +/* Return the metafile type of this rtgroup inode. */ +enum xfs_metafile_type +xfs_rtginode_metafile_type( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].metafile_type; +} + +/* Should this rtgroup inode be present? */ +bool +xfs_rtginode_enabled( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + if (!ops->enabled) + return true; + return ops->enabled(rtg_mount(rtg)); +} + +/* Mark an rtgroup inode sick */ +void +xfs_rtginode_mark_sick( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + xfs_group_mark_sick(rtg_group(rtg), ops->sick); +} + +/* Load and existing rtgroup inode into the rtgroup structure. */ +int +xfs_rtginode_load( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!xfs_has_rtgroups(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_RTGI_BITMAP: + ino = mp->m_sb.sb_rbmino; + break; + case XFS_RTGI_SUMMARY: + ino = mp->m_sb.sb_rsumino; + break; + default: + /* None of the other types exist on !rtgroups */ + return 0; + } + + error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type, + &ip); + } else { + const char *path; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!path) + return -ENOMEM; + error = xfs_metadir_load(tp, mp->m_rtdirip, path, + ops->metafile_type, &ip); + kfree(path); + } + + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_rtginode_mark_sick(rtg, type); + return error; + } + + if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type); + rtg->rtg_inodes[type] = ip; + return 0; +} + +/* Release an rtgroup metadata inode. */ +void +xfs_rtginode_irele( + struct xfs_inode **ipp) +{ + if (*ipp) + xfs_irele(*ipp); + *ipp = NULL; +} + +/* Add a metadata inode for a realtime rmap btree. */ +int +xfs_rtginode_create( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + bool init) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_metadir_update upd = { + .dp = mp->m_rtdirip, + .metafile_type = ops->metafile_type, + }; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + upd.path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!upd.path) + return -ENOMEM; + + error = xfs_metadir_start_create(&upd); + if (error) + goto out_path; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + goto out_cancel; + + xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); + + upd.ip->i_projid = rtg_rgno(rtg); + error = ops->create(rtg, upd.ip, upd.tp, init); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_path; + + kfree(upd.path); + xfs_finish_inode_setup(upd.ip); + rtg->rtg_inodes[type] = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } +out_path: + kfree(upd.path); + return error; +} + +/* Create the parent directory for all rtgroup inodes and load it. */ +int +xfs_rtginode_mkdir_parent( + struct xfs_mount *mp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip); +} + +/* Load the parent directory of all rtgroup inodes. */ +int +xfs_rtginode_load_parent( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups", + XFS_METAFILE_DIR, &mp->m_rtdirip); +} + +/* Check superblock fields for a read or a write. */ +static xfs_failaddr_t +xfs_rtsb_verify_common( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + + if (!xfs_verify_magic(bp, rsb->rsb_magicnum)) + return __this_address; + if (rsb->rsb_pad) + return __this_address; + + /* Everything to the end of the fs block must be zero */ + if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb))) + return __this_address; + + return NULL; +} + +/* Check superblock fields for a read or revalidation. */ +static inline xfs_failaddr_t +xfs_rtsb_verify_all( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + struct xfs_mount *mp = bp->b_mount; + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) + return fa; + + if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX)) + return __this_address; + if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid)) + return __this_address; + if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return NULL; +} + +static void +xfs_rtsb_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) { + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + return; + } + + fa = xfs_rtsb_verify_all(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +static void +xfs_rtsb_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_rtsb_buf_ops = { + .name = "xfs_rtsb", + .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) }, + .verify_read = xfs_rtsb_read_verify, + .verify_write = xfs_rtsb_write_verify, + .verify_struct = xfs_rtsb_verify_all, +}; + +/* Update a realtime superblock from the primary fs super */ +void +xfs_update_rtsb( + struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp) +{ + const struct xfs_dsb *dsb = sb_bp->b_addr; + struct xfs_rtsb *rsb = rtsb_bp->b_addr; + const uuid_t *meta_uuid; + + rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC); + + rsb->rsb_pad = 0; + memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX); + + memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid)); + + /* + * The metadata uuid is the fs uuid if the metauuid feature is not + * enabled. + */ + if (dsb->sb_features_incompat & + cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID)) + meta_uuid = &dsb->sb_meta_uuid; + else + meta_uuid = &dsb->sb_uuid; + memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid)); +} + +/* + * Update the realtime superblock from a filesystem superblock and log it to + * the given transaction. + */ +struct xfs_buf * +xfs_log_rtsb( + struct xfs_trans *tp, + const struct xfs_buf *sb_bp) +{ + struct xfs_buf *rtsb_bp; + + if (!xfs_has_rtsb(tp->t_mountp)) + return NULL; + + rtsb_bp = xfs_trans_getrtsb(tp); + if (!rtsb_bp) { + /* + * It's possible for the rtgroups feature to be enabled but + * there is no incore rt superblock buffer if the rt geometry + * was specified at mkfs time but the rt section has not yet + * been attached. In this case, rblocks must be zero. + */ + ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0); + return NULL; + } + + xfs_update_rtsb(rtsb_bp, sb_bp); + xfs_trans_ordered_buf(tp, rtsb_bp); + return rtsb_bp; +} diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h new file mode 100644 index 000000000000..d36a6ae0abe5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __LIBXFS_RTGROUP_H +#define __LIBXFS_RTGROUP_H 1 + +#include "xfs_group.h" + +struct xfs_mount; +struct xfs_trans; + +enum xfs_rtg_inodes { + XFS_RTGI_BITMAP, /* allocation bitmap */ + XFS_RTGI_SUMMARY, /* allocation summary */ + XFS_RTGI_RMAP, /* rmap btree inode */ + XFS_RTGI_REFCOUNT, /* refcount btree inode */ + + XFS_RTGI_MAX, +}; + +#ifdef MAX_LOCKDEP_SUBCLASSES +static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES); +#endif + +/* + * Realtime group incore structure, similar to the per-AG structure. + */ +struct xfs_rtgroup { + struct xfs_group rtg_group; + + /* per-rtgroup metadata inodes */ + struct xfs_inode *rtg_inodes[XFS_RTGI_MAX]; + + /* Number of blocks in this group */ + xfs_rtxnum_t rtg_extents; + + /* + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. + */ + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; +}; + +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. + */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + +static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_rtgroup, rtg_group); +} + +static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg) +{ + return &rtg->rtg_group; +} + +static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_mount; +} + +static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_gno; +} + +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + +static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_BITMAP]; +} + +static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; +} + +static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_RMAP]; +} + +static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; +} + +/* Passive rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_get( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_hold( + struct xfs_rtgroup *rtg) +{ + return to_rtg(xfs_group_hold(rtg_group(rtg))); +} + +static inline void +xfs_rtgroup_put( + struct xfs_rtgroup *rtg) +{ + xfs_group_put(rtg_group(rtg)); +} + +/* Active rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_grab( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG)); +} + +static inline void +xfs_rtgroup_rele( + struct xfs_rtgroup *rtg) +{ + xfs_group_rele(rtg_group(rtg)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next_range( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t start_rgno, + xfs_rgnumber_t end_rgno) +{ + return to_rtg(xfs_group_next_range(mp, rtg ? rtg_group(rtg) : NULL, + start_rgno, end_rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg) +{ + return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); +} + +static inline bool +xfs_verify_rgbno( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbno(rtg_group(rtg), rgbno); +} + +/* + * Check that [@rgbno,@len] is a valid extent range in @rtg. + * + * Must only be used for RTG-enabled file systems. + */ +static inline bool +xfs_verify_rgbext( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno, + xfs_extlen_t len) +{ + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); + + return xfs_verify_gbext(rtg_group(rtg), rgbno, len); +} + +static inline xfs_rtblock_t +xfs_rgbno_to_rtb( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + return xfs_gbno_to_fsb(rtg_group(rtg), rgbno); +} + +static inline xfs_rgnumber_t +xfs_rtb_to_rgno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG); +} + +static inline xfs_rgblock_t +xfs_rtb_to_rgbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG); +} + +/* Is rtbno the start of a RT group? */ +static inline bool +xfs_rtbno_is_group_start( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0; +} + +/* Convert an rtgroups rt extent number into an rgbno. */ +static inline xfs_rgblock_t +xfs_rtx_to_rgbno( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (likely(mp->m_rtxblklog >= 0)) + return rtx << mp->m_rtxblklog; + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_daddr_t +xfs_rtb_to_daddr( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); +} + +static inline xfs_rtblock_t +xfs_daddr_to_rtb( + struct xfs_mount *mp, + xfs_daddr_t daddr) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; + + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno; + uint32_t rgbno; + + rgno = div_u64_rem(bno, g->blocks, &rgbno); + return ((xfs_rtblock_t)rgno << g->blklog) + rgbno; + } + + return bno; +} + +#ifdef CONFIG_XFS_RT +int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno); + +void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno); +int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents); + +xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno); +void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents); + +int xfs_update_last_rtgroup_size(struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount); + +/* Lock the rt bitmap inode in exclusive mode */ +#define XFS_RTGLOCK_BITMAP (1U << 0) +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) +/* Lock the rt rmap inode in exclusive mode */ +#define XFS_RTGLOCK_RMAP (1U << 2) +/* Lock the rt refcount inode in exclusive mode */ +#define XFS_RTGLOCK_REFCOUNT (1U << 3) + +#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_BITMAP_SHARED | \ + XFS_RTGLOCK_RMAP | \ + XFS_RTGLOCK_REFCOUNT) + +void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + unsigned int rtglock_flags); + +int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); + +int xfs_rtginode_mkdir_parent(struct xfs_mount *mp); +int xfs_rtginode_load_parent(struct xfs_trans *tp); + +const char *xfs_rtginode_name(enum xfs_rtg_inodes type); +enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type); +bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + struct xfs_trans *tp); +int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + bool init); +void xfs_rtginode_irele(struct xfs_inode **ipp); + +void xfs_rtginode_irele(struct xfs_inode **ipp); + +static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type)); +} + +void xfs_update_rtsb(struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp); +struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp, + const struct xfs_buf *sb_bp); +#else +static inline void xfs_free_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno) +{ +} + +static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + return 0; +} + +# define xfs_rtgroup_extents(mp, rgno) (0) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) +# define xfs_rtgroup_lock(rtg, gf) ((void)0) +# define xfs_rtgroup_unlock(rtg, gf) ((void)0) +# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) +# define xfs_update_rtsb(bp, sb_bp) ((void)0) +# define xfs_log_rtsb(tp, sb_bp) (NULL) +# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + +#endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c new file mode 100644 index 000000000000..3db5e7a4a945 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_rtrefcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_health.h" + +static struct kmem_cache *xfs_rtrefcountbt_cur_cache; + +/* + * Realtime Reference Count btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_refcount_btree.c for more information. + * + * This tree is basically the same as the regular refcount btree except that + * it's rooted in an inode. + */ + +static struct xfs_btree_cur * +xfs_rtrefcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrefcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrefc_mnr[level != 0]; +} + +STATIC int +xfs_rtrefcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrefc_mxr[level != 0]; +} + +/* + * Calculate number of records in a realtime refcount btree inode root. + */ +unsigned int +xfs_rtrefcountbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrefcount_root); + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (2 * sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork so that + * we can resize the in-memory buffer to match it. After a resize to the + * maximum size this function returns the same value as + * xfs_rtrefcountbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrefcountbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrefc_mxr[level != 0]; + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +STATIC void +xfs_rtrefcountbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_rtrefcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_rtrefcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_rtrefcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC int64_t +xfs_rtrefcountbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; +} + +STATIC int64_t +xfs_rtrefcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +static xfs_failaddr_t +xfs_rtrefcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_reflink(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrefc_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); +} + +static void +xfs_rtrefcountbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrefcountbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrefcountbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrefcountbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { + .name = "xfs_rtrefcountbt", + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, + .verify_read = xfs_rtrefcountbt_read_verify, + .verify_write = xfs_rtrefcountbt_write_verify, + .verify_struct = xfs_rtrefcountbt_verify, +}; + +STATIC int +xfs_rtrefcountbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_rtrefcountbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} + +STATIC enum xbtree_key_contig +xfs_rtrefcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + +static inline void +xfs_rtrefcountbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrefcountbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, + new_size, old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, + new_size, new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrefcount_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrefcountbt_ops = { + .name = "rtrefcount", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), + .sick_mask = XFS_SICK_RG_REFCNTBT, + + .dup_cursor = xfs_rtrefcountbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrefcountbt_get_minrecs, + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, + .key_diff = xfs_rtrefcountbt_key_diff, + .buf_ops = &xfs_rtrefcountbt_buf_ops, + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .keys_inorder = xfs_rtrefcountbt_keys_inorder, + .recs_inorder = xfs_rtrefcountbt_recs_inorder, + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, + .broot_realloc = xfs_rtrefcountbt_broot_realloc, +}; + +/* Allocate a new rt refcount btree cursor. */ +struct xfs_btree_cur * +xfs_rtrefcountbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_refcount(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + cur->bc_ino.whichfork = XFS_DATA_FORK; + return cur; +} + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrefcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a realtime refcount btree block. */ +static inline unsigned int +xfs_rtrefcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Calculate number of records in an refcount btree block. + */ +unsigned int +xfs_rtrefcountbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime refcount btrees. */ +unsigned int +xfs_rtrefcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; + + /* We need at most one record for every block in an rt group. */ + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); +} + +int __init +xfs_rtrefcountbt_init_cur_cache(void) +{ + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", + xfs_btree_cur_sizeof( + xfs_rtrefcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrefcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrefcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); + xfs_rtrefcountbt_cur_cache = NULL; +} + +/* Compute the maximum height of a realtime refcount btree. */ +void +xfs_rtrefcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtreflink(mp)) { + mp->m_rtrefc_maxlevels = 0; + return; + } + + /* + * The realtime refcountbt lives on the data device, which means that + * its maximum height is constrained by the size of the data device and + * the height required to store one refcount record for each rtextent + * in an rt group. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, + mp->m_sb.sb_dblocks); + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, + mp->m_sb.sb_rgextents); + + /* Add one level to handle the inode root level. */ + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrefcount btree size for some records. */ +unsigned long long +xfs_rtrefcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +static unsigned long long +xfs_rtrefcountbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrefc_mxr[0] == 0) + return 0; + + return xfs_rtrefcountbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + * We need enough space to hold one record for every rt extent in the rtgroup. + */ +xfs_filblks_t +xfs_rtrefcountbt_calc_reserves( + struct xfs_mount *mp) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +STATIC void +xfs_rtrefcountbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrefcount_root *dblock, + int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + unsigned int rblocklen; + + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); + + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + tkp = xfs_rtrefcount_key_addr(rblock, 1); + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + trp = xfs_rtrefcount_rec_addr(rblock, 1); + numrecs = be16_to_cpu(dblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reference count btree root in from disk. */ +int +xfs_iformat_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrefcount inodes before adding a realtime + * volume to the filesystem, so we cannot use the rtrefcount predicate + * here. + */ + if (!xfs_has_reflink(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrefc_maxlevels || + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_rtrefcountbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + struct xfs_rtrefcount_root *dblock, + int dblocklen) +{ + struct xfs_refcount_key *fkp; + __be64 *fpp; + struct xfs_refcount_key *tkp; + __be64 *tpp; + struct xfs_refcount_rec *frp; + struct xfs_refcount_rec *trp; + unsigned int maxrecs; + unsigned int numrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrefcount_key_addr(rblock, 1); + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrefcount_rec_addr(rblock, 1); + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); + numrecs = be16_to_cpu(rblock->bb_numrecs); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reference count btree root out to disk. */ +void +xfs_iflush_rtrefcount( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, + ifp->if_broot_bytes, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime refcount btree inode. + */ +int +xfs_rtrefcountbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.h b/fs/xfs/libxfs/xfs_rtrefcount_btree.h new file mode 100644 index 000000000000..a99b7a8aec86 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTREFCOUNT_BTREE_H__ +#define __XFS_RTREFCOUNT_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; + +/* refcounts only exist on crc enabled filesystems */ +#define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, + unsigned int blocklen, bool leaf); +void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_refcount_rec * +xfs_rtrefcount_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); +int __init xfs_rtrefcountbt_init_cur_cache(void); +void xfs_rtrefcountbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); +unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +/* Addresses of key, pointers, and records within an ondisk rtrefcount block. */ + +static inline struct xfs_refcount_rec * +xfs_rtrefcount_droot_rec_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_rec)); +} + +static inline struct xfs_refcount_key * +xfs_rtrefcount_droot_key_addr( + struct xfs_rtrefcount_root *block, + unsigned int index) +{ + return (struct xfs_refcount_key *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_refcount_key)); +} + +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_droot_ptr_addr( + struct xfs_rtrefcount_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrefcount_ptr_t *) + ((char *)(block + 1) + + maxrecs * sizeof(struct xfs_refcount_key) + + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrefcount_ptr_t * +xfs_rtrefcount_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrefcount_ptr_addr(bb, index, + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrefcount_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) +{ + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrefcount_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrefcount_root); + + if (level > 0) + return sz + nrecs * (sizeof(struct xfs_refcount_key) + + sizeof(xfs_rtrefcount_ptr_t)); + return sz + nrecs * sizeof(struct xfs_refcount_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, + struct xfs_rtrefcount_root *dblock, int dblocklen); +void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); + +#endif /* __XFS_RTREFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c new file mode 100644 index 000000000000..9bdc2cbfc113 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -0,0 +1,1054 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_metafile.h" +#include "xfs_rmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_rtgroup.h" +#include "xfs_bmap.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" + +static struct kmem_cache *xfs_rtrmapbt_cur_cache; + +/* + * Realtime Reverse Map btree. + * + * This is a btree used to track the owner(s) of a given extent in the realtime + * device. See the comments in xfs_rmap_btree.c for more information. + * + * This tree is basically the same as the regular rmap btree except that it + * is rooted in an inode and does not live in free space. + */ + +static struct xfs_btree_cur * +xfs_rtrmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); +} + +STATIC int +xfs_rtrmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0) / 2; + } + + return cur->bc_mp->m_rtrmap_mnr[level != 0]; +} + +STATIC int +xfs_rtrmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, + level == 0); + } + + return cur->bc_mp->m_rtrmap_mxr[level != 0]; +} + +/* Calculate number of records in the ondisk realtime rmap btree inode root. */ +unsigned int +xfs_rtrmapbt_droot_maxrecs( + unsigned int blocklen, + bool leaf) +{ + blocklen -= sizeof(struct xfs_rtrmap_root); + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_rtrmapbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_rtrmapbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_rtrmap_mxr[level != 0]; + return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); +} + +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + +STATIC void +xfs_rtrmapbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); +} + +STATIC void +xfs_rtrmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rtrmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rtrmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + +STATIC int64_t +xfs_rtrmapbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; + + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC int64_t +xfs_rtrmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; + + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + return 0; +} + +static xfs_failaddr_t +xfs_rtrmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + int level; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (!xfs_has_rmapbt(mp)) + return __this_address; + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + level = be16_to_cpu(block->bb_level); + if (level > mp->m_rtrmap_maxlevels) + return __this_address; + + return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]); +} + +static void +xfs_rtrmapbt_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_btree_fsblock_verify_crc(bp)) + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + else { + fa = xfs_rtrmapbt_verify(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + } + + if (bp->b_error) + trace_xfs_btree_corrupt(bp, _RET_IP_); +} + +static void +xfs_rtrmapbt_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtrmapbt_verify(bp); + if (fa) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + xfs_btree_fsblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { + .name = "xfs_rtrmapbt", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_read_verify, + .verify_write = xfs_rtrmapbt_write_verify, + .verify_struct = xfs_rtrmapbt_verify, +}; + +STATIC int +xfs_rtrmapbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rtrmapbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + uint32_t x; + uint32_t y; + uint64_t a; + uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC enum xbtree_key_contig +xfs_rtrmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + +static inline void +xfs_rtrmapbt_move_ptrs( + struct xfs_mount *mp, + struct xfs_btree_block *broot, + short old_size, + size_t new_size, + unsigned int numrecs) +{ + void *dptr; + void *sptr; + + sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size); + dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size); + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t)); +} + +static struct xfs_btree_block * +xfs_rtrmapbt_broot_realloc( + struct xfs_btree_cur *cur, + unsigned int new_numrecs) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + struct xfs_btree_block *broot; + unsigned int new_size; + unsigned int old_size = ifp->if_broot_bytes; + const unsigned int level = cur->bc_nlevels - 1; + + new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs); + + /* Handle the nop case quietly. */ + if (new_size == old_size) + return ifp->if_broot; + + if (new_size > old_size) { + unsigned int old_numrecs; + + /* + * If there wasn't any memory allocated before, just allocate + * it now and get out. + */ + if (old_size == 0) + return xfs_broot_realloc(ifp, new_size); + + /* + * If there is already an existing if_broot, then we need to + * realloc it and possibly move the node block pointers because + * those are not butted up against the btree block header. + */ + old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0); + broot = xfs_broot_realloc(ifp, new_size); + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size, + old_numrecs); + goto out_broot; + } + + /* + * We're reducing numrecs. If we're going all the way to zero, just + * free the block. + */ + ASSERT(ifp->if_broot != NULL && old_size > 0); + if (new_size == 0) + return xfs_broot_realloc(ifp, 0); + + /* + * Shrink the btree root by possibly moving the rtrmapbt pointers, + * since they are not butted up against the btree block header. Then + * reallocate broot. + */ + if (level > 0) + xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, + new_numrecs); + broot = xfs_broot_realloc(ifp, new_size); + +out_broot: + ASSERT(xfs_rtrmap_droot_space(broot) <= + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); + return broot; +} + +const struct xfs_btree_ops xfs_rtrmapbt_ops = { + .name = "rtrmap", + .type = XFS_BTREE_TYPE_INODE, + .geom_flags = XFS_BTGEO_OVERLAPPING | + XFS_BTGEO_IROOT_RECORDS, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), + .sick_mask = XFS_SICK_RG_RMAPBT, + + .dup_cursor = xfs_rtrmapbt_dup_cursor, + .alloc_block = xfs_btree_alloc_metafile_block, + .free_block = xfs_btree_free_metafile_block, + .get_minrecs = xfs_rtrmapbt_get_minrecs, + .get_maxrecs = xfs_rtrmapbt_get_maxrecs, + .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, + .broot_realloc = xfs_rtrmapbt_broot_realloc, +}; + +/* Allocate a new rt rmap btree cursor. */ +struct xfs_btree_cur * +xfs_rtrmapbt_init_cursor( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + struct xfs_inode *ip = rtg_rmap(rtg); + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + + cur->bc_ino.ip = ip; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + cur->bc_ino.whichfork = XFS_DATA_FORK; + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); + + return cur; +} + +#ifdef CONFIG_XFS_BTREE_IN_MEM +/* + * Validate an in-memory realtime rmap btree block. Callers are allowed to + * generate an in-memory btree even if the ondisk feature is not enabled. + */ +static xfs_failaddr_t +xfs_rtrmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (xfs_has_rmapbt(mp)) { + if (level >= mp->m_rtrmap_maxlevels) + return __this_address; + } else { + if (level >= xfs_rtrmapbt_maxlevels_ondisk()) + return __this_address; + } + + maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rtrmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = { + .name = "xfs_rtrmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, + .verify_read = xfs_rtrmapbt_mem_rw_verify, + .verify_write = xfs_rtrmapbt_mem_rw_verify, + .verify_struct = xfs_rtrmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rtrmapbt_key_diff, + .buf_ops = &xfs_rtrmapbt_mem_buf_ops, + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .keys_inorder = xfs_rtrmapbt_keys_inorder, + .recs_inorder = xfs_rtrmapbt_recs_inorder, + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +xfs_rtrmapbt_mem_cursor( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + struct xfbtree *xfbt) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops, + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + cur->bc_group = xfs_group_hold(rtg_group(rtg)); + return cur; +} + +/* Create an in-memory realtime rmap btree. */ +int +xfs_rtrmapbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_rgnumber_t rgno) +{ + xfbt->owner = rgno; + return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops); +} +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +/* + * Install a new rt reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rtrmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); +} + +/* Calculate number of records in a rt reverse mapping btree block. */ +static inline unsigned int +xfs_rtrmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Calculate number of records in an rt reverse mapping btree block. + */ +unsigned int +xfs_rtrmapbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= XFS_RTRMAP_BLOCK_LEN; + return xfs_rtrmapbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for realtime reverse mapping btrees. */ +unsigned int +xfs_rtrmapbt_maxlevels_ondisk(void) +{ + unsigned long long max_dblocks; + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; + + /* + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. + * + * On a reflink filesystem, each block in an rtgroup can have up to + * 2^32 (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. However, we're + * likely to run out of blocks in the data device long before that + * happens, which means that we must compute the max height based on + * what the btree will look like if it consumes almost all the blocks + * in the data device due to maximal sharing factor. + */ + max_dblocks = -1U; /* max ag count */ + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; + return xfs_btree_space_to_height(minrecs, max_dblocks); +} + +int __init +xfs_rtrmapbt_init_cur_cache(void) +{ + xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur", + xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rtrmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rtrmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rtrmapbt_cur_cache); + xfs_rtrmapbt_cur_cache = NULL; +} + +/* Compute the maximum height of an rt reverse mapping btree. */ +void +xfs_rtrmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int d_maxlevels, r_maxlevels; + + if (!xfs_has_rtrmapbt(mp)) { + mp->m_rtrmap_maxlevels = 0; + return; + } + + /* + * The realtime rmapbt lives on the data device, which means that its + * maximum height is constrained by the size of the data device and + * the height required to store one rmap record for each block in an + * rt group. + * + * On a reflink filesystem, each rt block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. This makes the computation of + * maxlevels based on record count meaningless, so we only consider the + * size of the data device. + */ + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, + mp->m_sb.sb_dblocks); + if (xfs_has_rtreflink(mp)) { + mp->m_rtrmap_maxlevels = d_maxlevels + 1; + return; + } + + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, + mp->m_groups[XG_TYPE_RTG].blocks); + + /* Add one level to handle the inode root level. */ + mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; +} + +/* Calculate the rtrmap btree size for some records. */ +unsigned long long +xfs_rtrmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_rtrmap_mnr, len); +} + +/* + * Calculate the maximum rmap btree size. + */ +static unsigned long long +xfs_rtrmapbt_max_size( + struct xfs_mount *mp, + xfs_rtblock_t rtblocks) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rtrmap_mxr[0] == 0) + return 0; + + return xfs_rtrmapbt_calc_size(mp, rtblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +xfs_filblks_t +xfs_rtrmapbt_calc_reserves( + struct xfs_mount *mp) +{ + uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks; + + if (!xfs_has_rtrmapbt(mp)) + return 0; + + /* Reserve 1% of the rtgroup or enough for 1 block per record. */ + return max_t(xfs_filblks_t, blocks / 100, + xfs_rtrmapbt_max_size(mp, blocks)); +} + +/* Convert on-disk form of btree root to in-memory form. */ +STATIC void +xfs_rtrmapbt_from_disk( + struct xfs_inode *ip, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen, + struct xfs_btree_block *rblock) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock); + unsigned int numrecs; + unsigned int maxrecs; + + xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino); + + rblock->bb_level = dblock->bb_level; + rblock->bb_numrecs = dblock->bb_numrecs; + numrecs = be16_to_cpu(dblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_droot_key_addr(dblock, 1); + tkp = xfs_rtrmap_key_addr(rblock, 1); + fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_droot_rec_addr(dblock, 1); + trp = xfs_rtrmap_rec_addr(rblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Load a realtime reverse mapping btree root in from disk. */ +int +xfs_iformat_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + struct xfs_btree_block *broot; + unsigned int numrecs; + unsigned int level; + int dsize; + + /* + * growfs must create the rtrmap inodes before adding a realtime volume + * to the filesystem, so we cannot use the rtrmapbt predicate here. + */ + if (!xfs_has_rmapbt(ip->i_mount)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); + numrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (level > mp->m_rtrmap_maxlevels || + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); + return -EFSCORRUPTED; + } + + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), + xfs_rtrmap_broot_space_calc(mp, level, numrecs)); + if (broot) + xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot); + return 0; +} + +/* Convert in-memory form of btree root to on-disk form. */ +void +xfs_rtrmapbt_to_disk( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + unsigned int rblocklen, + struct xfs_rtrmap_root *dblock, + unsigned int dblocklen) +{ + struct xfs_rmap_key *fkp; + __be64 *fpp; + struct xfs_rmap_key *tkp; + __be64 *tpp; + struct xfs_rmap_rec *frp; + struct xfs_rmap_rec *trp; + unsigned int numrecs; + unsigned int maxrecs; + + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + numrecs = be16_to_cpu(rblock->bb_numrecs); + + if (be16_to_cpu(rblock->bb_level) > 0) { + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); + fkp = xfs_rtrmap_key_addr(rblock, 1); + tkp = xfs_rtrmap_droot_key_addr(dblock, 1); + fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); + } else { + frp = xfs_rtrmap_rec_addr(rblock, 1); + trp = xfs_rtrmap_droot_rec_addr(dblock, 1); + memcpy(trp, frp, sizeof(*frp) * numrecs); + } +} + +/* Flush a realtime reverse mapping btree root out to disk. */ +void +xfs_iflush_rtrmap( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes > 0); + ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <= + xfs_inode_fork_size(ip, XFS_DATA_FORK)); + xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, + dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); +} + +/* + * Create a realtime rmap btree inode. + */ +int +xfs_rtrmapbt_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *broot; + + ifp->if_format = XFS_DINODE_FMT_META_BTREE; + ASSERT(ifp->if_broot_bytes == 0); + ASSERT(ifp->if_bytes == 0); + + /* Initialize the empty incore btree root. */ + broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0)); + if (broot) + xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0, + ip->i_ino); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); + + return 0; +} + +/* + * Initialize an rmap for a realtime superblock using the potentially updated + * rt geometry in the provided @mp. + */ +int +xfs_rtrmapbt_init_rtsb( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + struct xfs_trans *tp) +{ + struct xfs_rmap_irec rmap = { + .rm_blockcount = mp->m_sb.sb_rextsize, + .rm_owner = XFS_RMAP_OWN_FS, + }; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_has_rtsb(mp)); + ASSERT(rtg_rgno(rtg) == 0); + + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_map_raw(cur, &rmap); + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. + */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h new file mode 100644 index 000000000000..e328fd62a149 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_RTRMAP_BTREE_H__ +#define __XFS_RTRMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xbtree_ifakeroot; +struct xfs_rtgroup; +struct xfbtree; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp, + struct xfs_rtgroup *rtg); +struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake); +void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp); +unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); +void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); +unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf); + +/* + * Addresses of records, keys, and pointers within an incore rtrmapbt block. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +static inline struct xfs_rmap_rec * +xfs_rtrmap_rec_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_high_key_addr( + struct xfs_btree_block *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + sizeof(struct xfs_rmap_key) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_ptr_addr( + struct xfs_btree_block *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)block + XFS_RTRMAP_BLOCK_LEN + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); + +int __init xfs_rtrmapbt_init_cur_cache(void); +void xfs_rtrmapbt_destroy_cur_cache(void); + +xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); + +/* Addresses of key, pointers, and records within an ondisk rtrmapbt block. */ + +static inline struct xfs_rmap_rec * +xfs_rtrmap_droot_rec_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_rec *) + ((char *)(block + 1) + + (index - 1) * sizeof(struct xfs_rmap_rec)); +} + +static inline struct xfs_rmap_key * +xfs_rtrmap_droot_key_addr( + struct xfs_rtrmap_root *block, + unsigned int index) +{ + return (struct xfs_rmap_key *) + ((char *)(block + 1) + + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); +} + +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_droot_ptr_addr( + struct xfs_rtrmap_root *block, + unsigned int index, + unsigned int maxrecs) +{ + return (xfs_rtrmap_ptr_t *) + ((char *)(block + 1) + + maxrecs * 2 * sizeof(struct xfs_rmap_key) + + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); +} + +/* + * Address of pointers within the incore btree root. + * + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +static inline xfs_rtrmap_ptr_t * +xfs_rtrmap_broot_ptr_addr( + struct xfs_mount *mp, + struct xfs_btree_block *bb, + unsigned int index, + unsigned int block_size) +{ + return xfs_rtrmap_ptr_addr(bb, index, + xfs_rtrmapbt_maxrecs(mp, block_size, false)); +} + +/* + * Compute the space required for the incore btree root containing the given + * number of records. + */ +static inline size_t +xfs_rtrmap_broot_space_calc( + struct xfs_mount *mp, + unsigned int level, + unsigned int nrecs) +{ + size_t sz = XFS_RTRMAP_BLOCK_LEN; + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the incore btree root given the ondisk + * btree root block. + */ +static inline size_t +xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb) +{ + return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +/* Compute the space required for the ondisk root block. */ +static inline size_t +xfs_rtrmap_droot_space_calc( + unsigned int level, + unsigned int nrecs) +{ + size_t sz = sizeof(struct xfs_rtrmap_root); + + if (level > 0) + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + + sizeof(xfs_rtrmap_ptr_t)); + return sz + nrecs * sizeof(struct xfs_rmap_rec); +} + +/* + * Compute the space required for the ondisk root block given an incore root + * block. + */ +static inline size_t +xfs_rtrmap_droot_space(struct xfs_btree_block *bb) +{ + return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level), + be16_to_cpu(bb->bb_numrecs)); +} + +int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); +void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, + unsigned int rblocklen, struct xfs_rtrmap_root *dblock, + unsigned int dblocklen); +void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); + +int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + struct xfs_trans *tp); + +unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + +struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_rgnumber_t rgno); + +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + +#endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d991eec05436..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -26,6 +26,10 @@ #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_exchrange.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -175,6 +179,14 @@ xfs_sb_version_to_features( features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) + features |= XFS_FEAT_EXCHANGE_RANGE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) + features |= XFS_FEAT_PARENT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) + features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -227,6 +239,73 @@ xfs_validate_sb_read( return 0; } +/* Return the number of extents covered by a single rt bitmap file */ +static xfs_rtbxlen_t +xfs_extents_per_rbm( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_rgextents; + return sbp->sb_rextents; +} + +/* + * Return the payload size of a single rt bitmap block (without the metadata + * header if any). + */ +static inline unsigned int +xfs_rtbmblock_size( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); + return sbp->sb_blocksize; +} + +static uint64_t +xfs_expected_rbmblocks( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; + return howmany_64(xfs_extents_per_rbm(sbp), + NBBY * xfs_rtbmblock_size(sbp)); +} + +/* Validate the realtime geometry */ +bool +xfs_validate_rt_geometry( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) + return false; + return true; + } + + if (sbp->sb_rextents == 0 || + sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || + sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || + sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) + return false; + + return true; +} + /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( @@ -260,13 +339,6 @@ xfs_validate_sb_write( * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"Corruption detected in superblock compatible features (0x%x)!", - (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); - return -EFSCORRUPTED; - } - if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, @@ -302,6 +374,106 @@ xfs_validate_sb_write( return 0; } +int +xfs_compute_rgblklog( + xfs_rtxlen_t rgextents, + xfs_rgblock_t rextsize) +{ + uint64_t rgblocks = (uint64_t)rgextents * rextsize; + + return xfs_highbit64(rgblocks - 1) + 1; +} + +static int +xfs_validate_sb_rtgroups( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + uint64_t groups; + int rgblklog; + + if (sbp->sb_rextsize == 0) { + xfs_warn(mp, +"Realtime extent size must not be zero."); + return -EINVAL; + } + + if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { + xfs_warn(mp, +"Realtime group size (%u) must be less than %u rt extents.", + sbp->sb_rgextents, + XFS_MAX_RGBLOCKS / sbp->sb_rextsize); + return -EINVAL; + } + + if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { + xfs_warn(mp, +"Realtime group size (%u) must be at least %u rt extents.", + sbp->sb_rgextents, XFS_MIN_RGEXTENTS); + return -EINVAL; + } + + if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { + xfs_warn(mp, +"Realtime groups (%u) must be less than %u.", + sbp->sb_rgcount, XFS_MAX_RGNUMBER); + return -EINVAL; + } + + groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); + if (groups != sbp->sb_rgcount) { + xfs_warn(mp, +"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", + sbp->sb_rgcount, groups); + return -EINVAL; + } + + /* Exchange-range is required for fsr to work on realtime files */ + if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { + xfs_warn(mp, +"Realtime groups feature requires exchange-range support."); + return -EINVAL; + } + + rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); + if (sbp->sb_rgblklog != rgblklog) { + xfs_warn(mp, +"Realtime group log (%d) does not match expected value (%d).", + sbp->sb_rgblklog, rgblklog); + return -EINVAL; + } + + return 0; +} + +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( @@ -313,6 +485,7 @@ xfs_validate_sb_common( uint32_t agcount = 0; uint32_t rem; bool has_dalign; + int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, @@ -361,6 +534,38 @@ xfs_validate_sb_common( sbp->sb_inoalignmt, align); return -EINVAL; } + + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { + xfs_warn(mp, +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); + return -EINVAL; + } + } else if (sbp->sb_spino_align) { + xfs_warn(mp, + "Sparse inode alignment (%u) should be zero.", + sbp->sb_spino_align); + return -EINVAL; + } + + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { + xfs_warn(mp, +"Metadir superblock padding fields must be zero."); + return -EINVAL; + } + + error = xfs_validate_sb_rtgroups(mp, sbp); + if (error) + return error; + } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { @@ -486,39 +691,13 @@ xfs_validate_sb_common( } } - /* Validate the realtime geometry; stolen from xfs_repair */ - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + if (!xfs_validate_rt_geometry(sbp)) { xfs_notice(mp, - "realtime extent sanity check failed"); + "realtime %sgeometry check failed", + sbp->sb_rblocks ? "" : "zeroed "); return -EFSCORRUPTED; } - if (sbp->sb_rblocks == 0) { - if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || - sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { - xfs_notice(mp, - "realtime zeroed geometry check failed"); - return -EFSCORRUPTED; - } - } else { - uint64_t rexts; - uint64_t rbmblocks; - - rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); - rbmblocks = howmany_64(sbp->sb_rextents, - NBBY * sbp->sb_blocksize); - - if (!xfs_validate_rtextents(rexts) || - sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_compute_rextslog(rexts) || - sbp->sb_rbmblocks != rbmblocks) { - xfs_notice(mp, - "realtime geometry sanity check failed"); - return -EFSCORRUPTED; - } - } - /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. @@ -530,7 +709,8 @@ xfs_validate_sb_common( } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), - XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) + XFS_FSB_TO_B(mp, sbp->sb_width), 0, + xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* @@ -554,6 +734,14 @@ xfs_validate_sb_common( void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + sbp->sb_uquotino = NULLFSINO; + sbp->sb_gquotino = NULLFSINO; + sbp->sb_pquotino = NULLFSINO; + return; + } + /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota @@ -677,6 +865,28 @@ __xfs_sb_from_disk( /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = be64_to_cpu(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); + to->sb_rgcount = be32_to_cpu(from->sb_rgcount); + to->sb_rgextents = be32_to_cpu(from->sb_rgextents); + to->sb_rbmino = NULLFSINO; + to->sb_rsumino = NULLFSINO; + } else { + to->sb_metadirino = NULLFSINO; + to->sb_rgcount = 1; + to->sb_rgextents = 0; + } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -694,6 +904,15 @@ xfs_sb_quota_to_disk( { uint16_t qflags = from->sb_qflags; + if (xfs_sb_is_v5(from) && + (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_uquotino = cpu_to_be64(0); + to->sb_gquotino = cpu_to_be64(0); + to->sb_pquotino = cpu_to_be64(0); + return; + } + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* @@ -824,6 +1043,21 @@ xfs_sb_to_disk( to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = cpu_to_be64(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memset(to->sb_pad, 0, sizeof(to->sb_pad)); + to->sb_rgcount = cpu_to_be32(from->sb_rgcount); + to->sb_rgextents = cpu_to_be32(from->sb_rgextents); + to->sb_rbmino = cpu_to_be64(0); + to->sb_rsumino = cpu_to_be64(0); + } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -953,6 +1187,49 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .verify_write = xfs_sb_write_verify, }; +/* Compute cached rt geometry from the incore sb. */ +void +xfs_sb_mount_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; + rgs->blklog = mp->m_sb.sb_rgblklog; + rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; + } else { + rgs->blocks = 0; + rgs->blklog = 0; + rgs->blkmask = (uint64_t)-1; + } +} + +/* Update incore sb rt extent size, then recompute the cached rt geometry. */ +void +xfs_mount_sb_set_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp, + xfs_agblock_t rextsize) +{ + sbp->sb_rextsize = rextsize; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, + rextsize); + + xfs_sb_mount_rextsize(mp, sbp); +} + /* * xfs_mount_common * @@ -967,6 +1244,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; @@ -975,31 +1254,47 @@ xfs_sb_mount_common( mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); - mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; + mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + ags->blocks = mp->m_sb.sb_agblocks; + ags->blklog = mp->m_sb.sb_agblklog; + ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); + + xfs_sb_mount_rextsize(mp, sbp); + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); + mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); + mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; + mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; + + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); @@ -1025,18 +1320,24 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. - * - * Do not update sb_frextents here because it is not part of the lazy - * sb counters, despite having a percpu counter. It is always kept - * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() - * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, - percpu_counter_sum(&mp->m_ifree), + percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); + } + + /* + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This counter can go negative due to the way + * we handle nearly-lockless reservations, so we must use the _positive + * variant here to avoid writing out nonsense frextents. + */ + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { + mp->m_sb.sb_frextents = + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -1088,18 +1389,17 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno = 1; + struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -1111,7 +1411,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - pag->pag_agno); + pag_agno(pag)); if (!saved_error) saved_error = error; continue; @@ -1125,26 +1425,22 @@ xfs_update_secondary_sbs( xfs_buf_relse(bp); /* don't hold too many buffers at once */ - if (agno % 16) + if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, pag->pag_agno); + error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); - if (error) { - xfs_warn(mp, - "write error %d updating a secondary superblock near ag %d", - error, agno); - } - + if (error) + xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } @@ -1154,10 +1450,12 @@ xfs_update_secondary_sbs( */ int xfs_sync_sb_buf( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; + struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); @@ -1167,6 +1465,11 @@ xfs_sync_sb_buf( bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); + if (update_rtsb) { + rtsb_bp = xfs_log_rtsb(tp, bp); + if (rtsb_bp) + xfs_trans_bhold(tp, rtsb_bp); + } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -1175,7 +1478,11 @@ xfs_sync_sb_buf( * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); + if (!error && rtsb_bp) + error = xfs_bwrite(rtsb_bp); out: + if (rtsb_bp) + xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } @@ -1250,6 +1557,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; + if (xfs_has_parent(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; @@ -1258,6 +1567,12 @@ xfs_fs_geometry( } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; + if (xfs_has_exchange_range(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; + if (xfs_has_metadir(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1273,6 +1588,15 @@ xfs_fs_geometry( return; geo->version = XFS_FSOP_GEOM_VERSION_V5; + + if (xfs_has_rtgroups(mp)) { + geo->rgcount = sbp->sb_rgcount; + geo->rgextents = sbp->sb_rgextents; + } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. */ @@ -1323,8 +1647,10 @@ xfs_sb_get_secondary( } /* - * sunit, swidth, sectorsize(optional with 0) should be all in bytes, - * so users won't be confused by values in error messages. + * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users + * won't be confused by values in error messages. This function returns false + * if the stripe geometry is invalid and the caller is unable to repair the + * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( @@ -1332,20 +1658,21 @@ xfs_validate_stripe_geometry( __s64 sunit, __s64 swidth, int sectorsize, + bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); - return false; + goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); - return false; + goto check_override; } if (sectorsize && (int)sunit % sectorsize) { @@ -1353,21 +1680,21 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); - return false; + goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); - return false; + goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); - return false; + goto check_override; } if (sunit && (int)swidth % (int)sunit) { @@ -1375,9 +1702,27 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); - return false; + goto check_override; } return true; + +check_override: + if (!may_repair) + return false; + /* + * During mount, mp->m_dalign will not be set unless the sunit mount + * option was set. If it was set, ignore the bad stripe alignment values + * and allow the validation and overwrite later in the mount process to + * attempt to overwrite the bad stripe alignment values with the values + * supplied by mount options. + */ + if (!mp->m_dalign) + return false; + if (!silent) + xfs_notice(mp, +"Will try to correct with specified mount options sunit (%d) and swidth (%d)", + BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); + return true; } /* diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 2e8e8d63d4eb..34d0dd374e9b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -15,8 +15,11 @@ struct xfs_perag; extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); -extern int xfs_sync_sb_buf(struct xfs_mount *mp); +extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_mount_sb_set_rextsize(struct xfs_mount *mp, + struct xfs_sb *sbp, xfs_agblock_t rextsize); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); @@ -35,9 +38,12 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, - __s64 sunit, __s64 swidth, int sectorsize, bool silent); +bool xfs_validate_stripe_geometry(struct xfs_mount *mp, + __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, + bool silent); +bool xfs_validate_rt_geometry(struct xfs_sb *sbp); uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index dfd61fa8332e..b1e0d9bc1f7d 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,7 +38,12 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; +extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; +extern const struct xfs_buf_ops xfs_rtsb_buf_ops; +extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; @@ -52,6 +57,9 @@ extern const struct xfs_btree_ops xfs_bmbt_ops; extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_ops; +extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; +extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -93,10 +101,26 @@ static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) { return ops == &xfs_rmapbt_mem_ops; } + +static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_mem_ops; +} #else # define xfs_btree_is_mem_rmap(...) (false) +# define xfs_btree_is_mem_rtrmap(...) (false) #endif +static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrmapbt_ops; +} + +static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rtrefcountbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -124,7 +148,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_RES_FDBLKS (1u << 6) /* Transaction contains an intent done log item */ #define XFS_TRANS_HAS_INTENT_DONE (1u << 7) - /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -136,7 +159,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, * for free space from AG 0. If the correct transaction reservations have been * made then this algorithm will eventually find all the space it needs. */ -#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */ +#define XFS_TRANS_LOWMODE (1u << 8) + +/* Transaction has locked the rtbitmap and rtsum inodes */ +#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9) /* * Field values for xfs_trans_mod_sb. @@ -155,6 +181,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_SB_RBLOCKS 0x00000800 #define XFS_TRANS_SB_REXTENTS 0x00001000 #define XFS_TRANS_SB_REXTSLOG 0x00002000 +#define XFS_TRANS_SB_RGCOUNT 0x00004000 /* * Here we centralize the specification of XFS meta-data buffer reference count @@ -175,13 +202,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_REFC_BTREE_REF 1 #define XFS_SSB_REF 0 -/* - * Flags for xfs_trans_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ -#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { /* Maximum inode count in this filesystem. */ @@ -229,6 +249,9 @@ struct xfs_ino_geometry { /* precomputed value for di_flags2 */ uint64_t new_diflags2; + /* minimum folio order of a page cache allocation */ + unsigned int min_folio_order; + }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index ffb1317a9212..fb47a76ead18 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -92,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) @@ -169,7 +171,8 @@ xfs_symlink_local_to_remote( struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp) + struct xfs_ifork *ifp, + void *priv) { struct xfs_mount *mp = ip->i_mount; char *buf; @@ -310,6 +313,7 @@ int xfs_symlink_write_target( struct xfs_trans *tp, struct xfs_inode *ip, + xfs_ino_t owner, const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, @@ -364,8 +368,7 @@ xfs_symlink_write_target( byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt, - bp); + buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp); memcpy(buf, cur_chunk, byte_cnt); @@ -380,3 +383,50 @@ xfs_symlink_write_target( ASSERT(pathlen == 0); return 0; } + +/* Remove all the blocks from a symlink and invalidate buffers. */ +int +xfs_symlink_remote_truncate( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp; + int nmaps = XFS_SYMLINK_MAPS; + int done = 0; + int i; + int error; + + /* Read mappings and invalidate buffers. */ + error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0); + if (error) + return error; + + for (i = 0; i < nmaps; i++) { + if (!xfs_bmap_is_real_extent(&mval[i])) + break; + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) + return error; + + xfs_trans_binval(tp, bp); + } + + /* Unmap the remote blocks. */ + error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done); + if (error) + return error; + if (!done) { + ASSERT(done); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h index a63bd38ae4fa..c1672fe1f17b 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.h +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -16,11 +16,13 @@ int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv); xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, - const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, - uint resblks); + xfs_ino_t owner, const char *target_path, int pathlen, + xfs_fsblock_t fs_blocks, uint resblks); +int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip); #endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 69fc5b981352..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,14 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); + if (flags & XFS_ICHGTIME_ACCESS) + inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6cd45e8c118d..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -20,6 +20,14 @@ #include "xfs_qm.h" #include "xfs_trans_space.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_item.h" +#include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -90,6 +98,14 @@ xfs_refcountbt_block_count( return num_ops * (2 * mp->m_refc_maxlevels - 1); } +static unsigned int +xfs_rtrefcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -128,7 +144,7 @@ xfs_calc_inode_res( (4 * sizeof(struct xlog_op_header) + sizeof(struct xfs_inode_log_format) + mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + 2 * xfs_bmbt_block_len(mp)); } /* @@ -211,7 +227,9 @@ xfs_calc_inode_chunk_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime - * extents, as well as the realtime summary block. + * extents, as well as the realtime summary block (t1). Realtime rmap btree + * operations happen in a second transaction, so factor in a couple of rtrmapbt + * splits (t2). */ static unsigned int xfs_rtalloc_block_count( @@ -220,10 +238,16 @@ xfs_rtalloc_block_count( { unsigned int rtbmp_blocks; xfs_rtxlen_t rtxlen; + unsigned int t1, t2 = 0; rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); - return (rtbmp_blocks + 1) * num_ops; + rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); + t1 = (rtbmp_blocks + 1) * num_ops; + + if (xfs_has_rmapbt(mp)) + t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1); + + return max(t1, t2); } /* @@ -246,26 +270,64 @@ xfs_rtalloc_block_count( */ /* + * Finishing a data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * - * This is calculated as: + * This is calculated as the max of: * Data device refcount updates (t1): * the agfs of the ags containing the blocks: nr_ops * sector size * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size */ static unsigned int xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + return max(t1, t2); } /* @@ -336,11 +398,11 @@ xfs_calc_write_reservation( blksz); t1 += adj; t3 += adj; - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 1); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -351,6 +413,96 @@ xfs_calc_write_reservation_minlogsize( } /* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + +/* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size @@ -382,16 +534,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -408,11 +552,11 @@ xfs_calc_itruncate_reservation( xfs_refcountbt_block_count(mp, 4), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + return XFS_DQUOT_LOGRES + max3(t1, t2, t3); } t4 = xfs_calc_refcountbt_reservation(mp, 2); - return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); + return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3)); } unsigned int @@ -422,29 +566,108 @@ xfs_calc_itruncate_reservation_minlogsize( return xfs_calc_itruncate_reservation(mp, true); } +static inline unsigned int xfs_calc_pptr_link_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_unlink_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_replace_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} + /* * In renaming a files we can modify: * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: + * of bmap blocks) giving (t2): * the agf for the ags in which the blocks live: 3 * sector size * the agfl for the ags in which the blocks live: 3 * sector size * the superblock for the free block count: sector size * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + * If parent pointers are enabled (t3), then each transaction in the chain + * must be capable of setting or removing the extended attribute + * containing the parent information. It must also be able to handle + * the three xattr intent items that track the progress of the parent + * pointer update. */ STATIC uint xfs_calc_rename_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 5) + - xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_inode_res(mp, 5) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)); + + t2 = xfs_calc_finish_efi_reservation(mp, 3); + + if (xfs_has_parent(mp)) { + unsigned int rename_overhead, exchange_overhead; + + t3 = max(resp->tr_attrsetm.tr_logres, + resp->tr_attrrm.tr_logres); + + /* + * For a standard rename, the three xattr intent log items + * are (1) replacing the pptr for the source file; (2) + * removing the pptr on the dest file; and (3) adding a + * pptr for the whiteout file in the src dir. + * + * For an RENAME_EXCHANGE, there are two xattr intent + * items to replace the pptr for both src and dest + * files. Link counts don't change and there is no + * whiteout. + * + * In the worst case we can end up relogging all log + * intent items to allow the log tail to move ahead, so + * they become overhead added to each transaction in a + * processing chain. + */ + rename_overhead = xfs_calc_pptr_replace_overhead() + + xfs_calc_pptr_unlink_overhead() + + xfs_calc_pptr_link_overhead(); + exchange_overhead = 2 * xfs_calc_pptr_replace_overhead(); + + overhead += max(rename_overhead, exchange_overhead); + } + + return overhead + max3(t1, t2, t3); +} + +static inline unsigned int +xfs_rename_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* One for the rename, one more for freeing blocks */ + unsigned int ret = XFS_RENAME_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to remove or add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += max(resp->tr_attrsetm.tr_logcount, + resp->tr_attrrm.tr_logcount); + + return ret; } /* @@ -461,6 +684,23 @@ xfs_calc_iunlink_remove_reservation( 2 * M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_link_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_LINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * For creating a link to an inode: * the parent directory inode: inode size @@ -477,14 +717,21 @@ STATIC uint xfs_calc_link_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_remove_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_remove_reservation(mp); + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -499,6 +746,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_remove_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_REMOVE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrrm.tr_logcount; + + return ret; +} + /* * For removing a directory entry we can modify: * the parent directory inode: inode size @@ -515,14 +779,22 @@ STATIC uint xfs_calc_remove_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES; + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_add_reservation(mp); + + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 2); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrrm.tr_logres; + overhead += xfs_calc_pptr_unlink_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -571,24 +843,69 @@ xfs_calc_icreate_resv_alloc( xfs_calc_finobt_res(mp); } +static inline unsigned int +xfs_icreate_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_CREATE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + STATIC uint -xfs_calc_icreate_reservation(xfs_mount_t *mp) +xfs_calc_icreate_reservation( + struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max(xfs_calc_icreate_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES; + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_icreate_resv_alloc(mp); + t2 = xfs_calc_create_resv_modify(mp); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } STATIC uint xfs_calc_create_tmpfile_reservation( struct xfs_mount *mp) { - uint res = XFS_DQUOT_LOGRES(mp); + uint res = XFS_DQUOT_LOGRES; res += xfs_calc_icreate_resv_alloc(mp); return res + xfs_calc_iunlink_add_reservation(mp); } +static inline unsigned int +xfs_mkdir_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_MKDIR_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * Making a new directory is the same as creating a new file. */ @@ -599,6 +916,22 @@ xfs_calc_mkdir_reservation( return xfs_calc_icreate_reservation(mp); } +static inline unsigned int +xfs_symlink_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_SYMLINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} /* * Making a new symplink is the same as creating a new file, but @@ -632,7 +965,7 @@ STATIC uint xfs_calc_ifree_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_iunlink_remove_reservation(mp) + @@ -649,7 +982,7 @@ STATIC uint xfs_calc_ichange_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); @@ -721,7 +1054,7 @@ xfs_calc_growrtfree_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + - xfs_calc_buf_res(1, mp->m_rsumsize); + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, mp->m_rsumblocks)); } /* @@ -758,7 +1091,7 @@ STATIC uint xfs_calc_addafork_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + @@ -806,7 +1139,7 @@ STATIC uint xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); @@ -846,7 +1179,7 @@ STATIC uint xfs_calc_attrrm_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + + return XFS_DQUOT_LOGRES + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)) + @@ -911,54 +1244,85 @@ xfs_calc_sb_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } -void -xfs_trans_resv_calc( +/* + * Namespace reservations. + * + * These get tricky when parent pointers are enabled as we have attribute + * modifications occurring from within these transactions. Rather than confuse + * each of these reservation calculations with the conditional attribute + * reservations, add them here in a clear and concise manner. This requires that + * the attribute reservations have already been calculated. + * + * Note that we only include the static attribute reservation here; the runtime + * reservation will have to be modified by the size of the attributes being + * added/removed/modified. See the comments on the attribute reservation + * calculations for more details. + */ +STATIC void +xfs_calc_namespace_reservations( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - int logcount_adj = 0; - - /* - * The following transactions are logged in physical format and - * require a permanent reservation on space. - */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; - resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; - resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + ASSERT(resp->tr_attrsetm.tr_logres > 0); resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); - resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp); resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); - resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp); resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); - resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp); resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); - resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp); resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); - resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp); resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp); + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; +} + +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + int logcount_adj = 0; + + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = xfs_calc_create_tmpfile_reservation(mp); resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); - resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; - resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -988,6 +1352,8 @@ xfs_trans_resv_calc( resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + xfs_calc_namespace_reservations(mp, resp); + /* * The following transactions are logged in logical format with * a default log count. @@ -1027,4 +1393,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. + */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. + */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. + */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,32 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c new file mode 100644 index 000000000000..b9dc3752f702 --- /dev/null +++ b/fs/xfs/libxfs/xfs_trans_space.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" + +/* Calculate the disk space required to add a parent pointer. */ +unsigned int +xfs_parent_calc_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + /* + * Parent pointers are always the first attr in an attr tree, and never + * larger than a block + */ + return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + + XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK); +} + +unsigned int +xfs_create_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_mkdir_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + return xfs_create_space_res(mp, namelen); +} + +unsigned int +xfs_link_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_symlink_space_res( + struct xfs_mount *mp, + unsigned int namelen, + unsigned int fsblocks) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) + + fsblocks; + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_remove_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp); + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_rename_space_res( + struct xfs_mount *mp, + unsigned int src_namelen, + bool target_exists, + unsigned int target_namelen, + bool has_whiteout) +{ + unsigned int ret; + + ret = XFS_DIRREMOVE_SPACE_RES(mp) + + XFS_DIRENTER_SPACE_RES(mp, target_namelen); + + if (xfs_has_parent(mp)) { + if (has_whiteout) + ret += xfs_parent_calc_space_res(mp, src_namelen); + ret += 2 * xfs_parent_calc_space_res(mp, target_namelen); + } + + if (target_exists) + ret += xfs_parent_calc_space_res(mp, target_namelen); + + return ret; +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 87b31c69a773..d89b570aafcc 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -10,6 +10,23 @@ * Components of space reservations. */ +/* Worst case number of bmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ + (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) + +/* Worst case number of realtime rmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \ + (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0])) + +/* Adding one realtime rmap could split every level to the top of the tree. */ +#define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels) + +/* Blocks we might need to add "b" realtime rmaps to a tree. */ +#define XFS_NRTRMAPADD_SPACE_RES(mp, b) \ + ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \ + XFS_RTRMAPADD_SPACE_RES(mp)) + /* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) @@ -76,31 +93,32 @@ /* This macro is not used - see inline code in xfs_attr_set */ #define XFS_ATTRSET_SPACE_RES(mp, v) \ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) -#define XFS_CREATE_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) -#define XFS_LINK_SPACE_RES(mp,nl) \ - XFS_DIRENTER_SPACE_RES(mp,nl) -#define XFS_MKDIR_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_QM_DQALLOC_SPACE_RES(mp) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ XFS_DQUOT_CLUSTER_SIZE_FSB) #define XFS_QM_QINOCREATE_SPACE_RES(mp) \ XFS_IALLOC_SPACE_RES(mp) -#define XFS_REMOVE_SPACE_RES(mp) \ - XFS_DIRREMOVE_SPACE_RES(mp) -#define XFS_RENAME_SPACE_RES(mp,nl) \ - (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) -#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0) +unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp, + unsigned int namelen); + +unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen, + unsigned int fsblocks); +unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen); + +unsigned int xfs_rename_space_res(struct xfs_mount *mp, + unsigned int src_namelen, bool target_exists, + unsigned int target_namelen, bool has_whiteout); #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index c299b16c9365..1faf04204c5d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -12,6 +12,8 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* @@ -111,7 +113,7 @@ xfs_verify_ino( /* Is this an internal inode number? */ inline bool -xfs_internal_inum( +xfs_is_sb_inum( struct xfs_mount *mp, xfs_ino_t ino) { @@ -129,24 +131,42 @@ xfs_verify_dir_ino( struct xfs_mount *mp, xfs_ino_t ino) { - if (xfs_internal_inum(mp, ino)) + if (xfs_is_sb_inum(mp, ino)) return false; return xfs_verify_ino(mp, ino); } /* - * Verify that an realtime block number pointer doesn't point off the - * end of the realtime device. + * Verify that a realtime block number pointer neither points outside the + * allocatable areas of the rtgroup nor off the end of the realtime + * device. */ inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + if (xfs_has_rtgroups(mp)) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno); + + if (rgno >= mp->m_sb.sb_rgcount) + return false; + if (rtx >= xfs_rtgroup_extents(mp, rgno)) + return false; + if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0) + return false; + return true; + } + return rtbno < mp->m_sb.sb_rblocks; } -/* Verify that a realtime device extent is fully contained inside the volume. */ +/* + * Verify that an allocated realtime device extent neither points outside + * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the + * end of the realtime device. + */ bool xfs_verify_rtbext( struct xfs_mount *mp, @@ -159,7 +179,14 @@ xfs_verify_rtbext( if (!xfs_verify_rtbno(mp, rtbno)) return false; - return xfs_verify_rtbno(mp, rtbno + len - 1); + if (!xfs_verify_rtbno(mp, rtbno + len - 1)) + return false; + + if (xfs_has_rtgroups(mp) && + xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1)) + return false; + + return true; } /* Calculate the range of valid icount values. */ @@ -170,13 +197,12 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 76eb9e328835..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -9,10 +9,12 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef uint32_t xfs_rgnumber_t; /* realtime group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ @@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t; #define NULLFILEOFF ((xfs_fileoff_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLRGBLOCK ((xfs_rgblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLRGNUMBER ((xfs_rgnumber_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -198,6 +202,13 @@ enum xfs_ag_resv_type { * altering fdblocks. If you think you need this you're wrong. */ XFS_AG_RESV_IGNORE, + + /* + * This allocation activity is being done on behalf of a metadata file. + * These files maintain their own permanent space reservations and are + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. + */ + XFS_AG_RESV_METAFILE, }; /* Results of scanning a btree keyspace to check occupancy. */ @@ -212,6 +223,44 @@ enum xbtree_recpacking { XBTREE_RECPACKING_FULL, }; +enum xfs_group_type { + XG_TYPE_AG, + XG_TYPE_RTG, + XG_TYPE_MAX, +} __packed; + +#define XG_TYPE_STRINGS \ + { XG_TYPE_AG, "ag" }, \ + { XG_TYPE_RTG, "rtg" } + +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. + */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ @@ -222,7 +271,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, @@ -235,16 +284,4 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} - #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. + */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ |