Diffstat (limited to 'fs/xfs/libxfs/xfs_alloc.c')
-rw-r--r--	fs/xfs/libxfs/xfs_alloc.c	3535
1 file changed, 2279 insertions(+), 1256 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b715668886a4..ad381c73abc4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -10,10 +10,8 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_alloc_btree.h" @@ -21,29 +19,23 @@ #include "xfs_extent_busy.h" #include "xfs_errortag.h" #include "xfs_error.h" -#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" +#include "xfs_health.h" +#include "xfs_extfree_item.h" -extern kmem_zone_t *xfs_bmap_free_item_zone; +struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; -#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) - #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); - /* * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in * the beginning of the block for a proper header with the location information @@ -55,7 +47,7 @@ xfs_agfl_size( { unsigned int size = mp->m_sb.sb_sectsize; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) size -= sizeof(struct xfs_agfl); return size / sizeof(xfs_agblock_t); @@ -65,9 +57,9 @@ unsigned int xfs_refc_block( struct xfs_mount *mp) { - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -76,16 +68,34 @@ xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) return xfs_refc_block(mp) + 1; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } /* + * The number of blocks per AG that we withhold from xfs_dec_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_dec_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. 
+ * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -97,14 +107,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -128,31 +138,50 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks++; /* rmap root block */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) + blocks++; /* rmap root block */ + if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; } + +static int +xfs_alloc_lookup( + struct xfs_btree_cur *cur, + xfs_lookup_t dir, + xfs_agblock_t bno, + xfs_extlen_t len, + int *stat) +{ + int error; + + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + error = xfs_btree_lookup(cur, dir, stat); + if (*stat == 1) + cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; + else + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + return error; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ -STATIC int /* error */ +static inline int /* error */ xfs_alloc_lookup_eq( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat); } /* @@ -166,9 +195,7 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat); } /* @@ -182,9 +209,14 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); +} + +static inline bool +xfs_alloc_cur_active( + struct xfs_btree_cur *cur) +{ + return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); } /* @@ -205,6 +237,50 @@ xfs_alloc_update( return xfs_btree_update(cur, &rec); } +/* Convert the ondisk btree record to its incore representation. 
*/ +void +xfs_alloc_btrec_to_irec( + const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec) +{ + irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); +} + +/* Simple checks for free space records. */ +xfs_failaddr_t +xfs_alloc_check_irec( + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) +{ + if (irec->ar_blockcount == 0) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_alloc_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; + + xfs_warn(mp, + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_group->xg_gno, fa); + xfs_warn(mp, + "start block 0x%x block count 0x%x", irec->ar_startblock, + irec->ar_blockcount); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; +} + /* * Get the data from the pointed-to record. */ @@ -215,38 +291,23 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat) /* output: success/failure */ { - struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + struct xfs_alloc_rec_incore irec; union xfs_btree_rec *rec; + xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !(*stat)) return error; - *bno = be32_to_cpu(rec->alloc.ar_startblock); - *len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (*len == 0) - goto out_bad_rec; - - /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) - goto out_bad_rec; + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); + *bno = irec.ar_startblock; + *len = irec.ar_blockcount; return 0; - -out_bad_rec: - xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? 
"Block" : "Size", agno); - xfs_warn(mp, - "start block 0x%x block count 0x%x", *bno, *len); - return -EFSCORRUPTED; } /* @@ -268,7 +329,8 @@ xfs_alloc_compute_aligned( bool busy; /* Trim busy sections out of found extent */ - busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); + busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, + args->maxlen, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -317,7 +379,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ - bool userdata = xfs_alloc_is_userdata(datatype); + bool userdata = datatype & XFS_ALLOC_USERDATA; ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -346,8 +408,8 @@ xfs_alloc_compute_diff( if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { if (newlen1 < newlen2 || (newlen1 == newlen2 && - XFS_ABSDIFF(newbno1, wantbno) > - XFS_ABSDIFF(newbno2, wantbno))) + abs_diff(newbno1, wantbno) > + abs_diff(newbno2, wantbno))) newbno1 = newbno2; } else if (newbno2 != NULLAGBLOCK) newbno1 = newbno2; @@ -363,7 +425,7 @@ xfs_alloc_compute_diff( } else newbno1 = freeend - wantlen; *newbnop = newbno1; - return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); + return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); } /* @@ -404,6 +466,97 @@ xfs_alloc_fix_len( } /* + * Determine if the cursor points to the block that contains the right-most + * block of records in the by-count btree. This block contains the largest + * contiguous free extent in the AG, so if we modify a record in this block we + * need to call xfs_alloc_fixup_longest() once the modifications are done to + * ensure the agf->agf_longest field is kept up to date with the longest free + * extent tracked by the by-count btree. + */ +static bool +xfs_alloc_cursor_at_lastrec( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cnt_cur, 0, &bp); + + xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB); + return xfs_btree_ptr_is_null(cnt_cur, &ptr); +} + +/* + * Find the rightmost record of the cntbt, and return the longest free space + * recorded in it. Simply set both the block number and the length to their + * maximum values before searching. + */ +static int +xfs_cntbt_longest( + struct xfs_btree_cur *cnt_cur, + xfs_extlen_t *longest) +{ + struct xfs_alloc_rec_incore irec; + union xfs_btree_rec *rec; + int stat = 0; + int error; + + memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec)); + error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + /* totally empty tree */ + *longest = 0; + return 0; + } + + error = xfs_btree_get_rec(cnt_cur, &rec, &stat); + if (error) + return error; + if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } + + xfs_alloc_btrec_to_irec(rec, &irec); + *longest = irec.ar_blockcount; + return 0; +} + +/* + * Update the longest contiguous free extent in the AG from the by-count cursor + * that is passed to us. This should be done at the end of any allocation or + * freeing operation that touches the longest extent in the btree. + * + * Needing to update the longest extent can be determined by calling + * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record + * modification but before the modification begins. 
+ */ +static int +xfs_alloc_fixup_longest( + struct xfs_btree_cur *cnt_cur) +{ + struct xfs_perag *pag = to_perag(cnt_cur->bc_group); + struct xfs_buf *bp = cnt_cur->bc_ag.agbp; + struct xfs_agf *agf = bp->b_addr; + xfs_extlen_t longest = 0; + int error; + + /* Lookup last rec in order to update AGF. */ + error = xfs_cntbt_longest(cnt_cur, &longest); + if (error) + return error; + + pag->pagf_longest = longest; + agf->agf_longest = cpu_to_be32(pag->pagf_longest); + xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST); + + return 0; +} + +/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. @@ -412,8 +565,8 @@ xfs_alloc_fix_len( */ STATIC int /* error code */ xfs_alloc_fixup_trees( - xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ - xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ + struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ xfs_agblock_t fbno, /* starting block of free extent */ xfs_extlen_t flen, /* length of free extent */ xfs_agblock_t rbno, /* starting block of returned extent */ @@ -427,6 +580,7 @@ xfs_alloc_fixup_trees( xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ struct xfs_mount *mp; + bool fixup_longest = false; mp = cnt_cur->bc_mp; @@ -437,13 +591,21 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } } /* * Look up the record in the by-block tree if necessary. @@ -452,13 +614,21 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } } #ifdef DEBUG @@ -466,11 +636,15 @@ xfs_alloc_fixup_trees( struct xfs_btree_block *bnoblock; struct xfs_btree_block *cntblock; - bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); - cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); - XFS_WANT_CORRUPTED_RETURN(mp, - bnoblock->bb_numrecs == cntblock->bb_numrecs); + if (XFS_IS_CORRUPT(mp, + bnoblock->bb_numrecs != + cntblock->bb_numrecs)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } } #endif @@ -495,30 +669,49 @@ xfs_alloc_fixup_trees( nfbno2 = rbno + rlen; nflen2 = (fbno + flen) - nfbno2; } + + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; + /* * Delete the entry from the by-size btree. 
*/ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + return -EFSCORRUPTED; + } } /* * Fix up the by-block btree entry(s). @@ -529,7 +722,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } } else { /* * Update the by-block entry to start later|be shorter. @@ -543,34 +739,58 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); + return -EFSCORRUPTED; + } } + + if (fixup_longest) + return xfs_alloc_fixup_longest(cnt_cur); + return 0; } +/* + * We do not verify the AGFL contents against AGF-based index counters here, + * even though we may have access to the perag that contains shadow copies. We + * don't know if the AGF based counters have been checked, and if they have they + * still may be inconsistent because they haven't yet been reset on the first + * allocation after the AGF has been read in. + * + * This means we can only check that all agfl entries contain valid or null + * values because we can't reliably determine the active range to exclude + * NULLAGBNO as a valid value. + * + * However, we can't even do that for v4 format filesystems because there are + * old versions of mkfs out there that does not initialise the AGFL to known, + * verifiable values. HEnce we can't tell the difference between a AGFL block + * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. + * + * As a result, we can only fully validate AGFL block numbers when we pull them + * from the freelist in xfs_alloc_get_freelist(). + */ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; - /* - * There is no verification of non-crc AGFLs because mkfs does not - * initialise the AGFL to zero or NULL. Hence the only valid part of the - * AGFL is what the AGF says is active. We can't get to the AGF, so we - * can't verify just those entries are valid. 
- */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -578,12 +798,12 @@ xfs_agfl_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag))) return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && - be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; } @@ -596,7 +816,7 @@ static void xfs_agfl_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; /* @@ -605,7 +825,7 @@ xfs_agfl_read_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) @@ -621,12 +841,12 @@ static void xfs_agfl_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_agfl_verify(bp); @@ -643,6 +863,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -651,21 +872,21 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. 
*/ -int /* error */ +int xfs_alloc_read_agfl( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_buf_t **bpp) /* buffer for the ag free block array */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **bpp) { - xfs_buf_t *bp; /* return value */ - int error; + struct xfs_mount *mp = pag_mount(pag); + struct xfs_buf *bp; + int error; - ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); if (error) return error; xfs_buf_set_ref(bp, XFS_AGFL_REF); @@ -676,98 +897,411 @@ xfs_alloc_read_agfl( STATIC int xfs_alloc_update_counters( struct xfs_trans *tp, - struct xfs_perag *pag, struct xfs_buf *agbp, long len) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; - pag->pagf_freeblks += len; + agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); - xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > - be32_to_cpu(agf->agf_length))) + be32_to_cpu(agf->agf_length))) { + xfs_buf_mark_corrupt(agbp); + xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; + } xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); return 0; } /* - * Allocation group level functions. + * Block allocation algorithm and data structures. */ +struct xfs_alloc_cur { + struct xfs_btree_cur *cnt; /* btree cursors */ + struct xfs_btree_cur *bnolt; + struct xfs_btree_cur *bnogt; + xfs_extlen_t cur_len;/* current search length */ + xfs_agblock_t rec_bno;/* extent startblock */ + xfs_extlen_t rec_len;/* extent length */ + xfs_agblock_t bno; /* alloc bno */ + xfs_extlen_t len; /* alloc len */ + xfs_extlen_t diff; /* diff from search bno */ + unsigned int busy_gen;/* busy state */ + bool busy; +}; /* - * Allocate a variable extent in the allocation group agno. - * Type and bno are used to determine where in the allocation group the - * extent will start. - * Extent's length (returned in *len) will be between minlen and maxlen, - * and of the form k * prod + mod unless there's nothing that large. - * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + * Set up cursors, etc. in the extent allocation cursor. This function can be + * called multiple times to reset an initialized structure without having to + * reallocate cursors. */ -STATIC int /* error */ -xfs_alloc_ag_vextent( - xfs_alloc_arg_t *args) /* argument structure for allocation */ +static int +xfs_alloc_cur_setup( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) { - int error=0; + int error; + int i; - ASSERT(args->minlen > 0); - ASSERT(args->maxlen > 0); - ASSERT(args->minlen <= args->maxlen); - ASSERT(args->mod < args->prod); - ASSERT(args->alignment > 0); + acur->cur_len = args->maxlen; + acur->rec_bno = 0; + acur->rec_len = 0; + acur->bno = 0; + acur->len = 0; + acur->diff = -1; + acur->busy = false; + acur->busy_gen = 0; /* - * Branch to correct routine based on the type. + * Perform an initial cntbt lookup to check for availability of maxlen + * extents. If this fails, we'll return -ENOSPC to signal the caller to + * attempt a small allocation. 
*/ - args->wasfromfl = 0; - switch (args->type) { - case XFS_ALLOCTYPE_THIS_AG: - error = xfs_alloc_ag_vextent_size(args); - break; - case XFS_ALLOCTYPE_NEAR_BNO: - error = xfs_alloc_ag_vextent_near(args); - break; - case XFS_ALLOCTYPE_THIS_BNO: - error = xfs_alloc_ag_vextent_exact(args); - break; - default: - ASSERT(0); - /* NOTREACHED */ - } + if (!acur->cnt) + acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); + error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); + if (error) + return error; + + /* + * Allocate the bnobt left and right search cursors. + */ + if (!acur->bnolt) + acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); + if (!acur->bnogt) + acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); + return i == 1 ? 0 : -ENOSPC; +} + +static void +xfs_alloc_cur_close( + struct xfs_alloc_cur *acur, + bool error) +{ + int cur_error = XFS_BTREE_NOERROR; + + if (error) + cur_error = XFS_BTREE_ERROR; + + if (acur->cnt) + xfs_btree_del_cursor(acur->cnt, cur_error); + if (acur->bnolt) + xfs_btree_del_cursor(acur->bnolt, cur_error); + if (acur->bnogt) + xfs_btree_del_cursor(acur->bnogt, cur_error); + acur->cnt = acur->bnolt = acur->bnogt = NULL; +} + +/* + * Check an extent for allocation and track the best available candidate in the + * allocation structure. The cursor is deactivated if it has entered an out of + * range state based on allocation arguments. Optionally return the extent + * extent geometry and allocation status if requested by the caller. + */ +static int +xfs_alloc_cur_check( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + int *new) +{ + int error, i; + xfs_agblock_t bno, bnoa, bnew; + xfs_extlen_t len, lena, diff = -1; + bool busy; + unsigned busy_gen = 0; + bool deactivate = false; + bool isbnobt = xfs_btree_is_bno(cur->bc_ops); + + *new = 0; - if (error || args->agbno == NULLAGBLOCK) + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + /* + * Check minlen and deactivate a cntbt cursor if out of acceptable size + * range (i.e., walking backwards looking for a minlen extent). + */ + if (len < args->minlen) { + deactivate = !isbnobt; + goto out; + } + + busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, + &busy_gen); + acur->busy |= busy; + if (busy) + acur->busy_gen = busy_gen; + /* deactivate a bnobt cursor outside of locality range */ + if (bnoa < args->min_agbno || bnoa > args->max_agbno) { + deactivate = isbnobt; + goto out; + } + if (lena < args->minlen) + goto out; + + args->len = XFS_EXTLEN_MIN(lena, args->maxlen); + xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); - ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); - ASSERT(args->agbno % args->alignment == 0); + if (args->len < acur->len) + goto out; - /* if not file data, insert new block into the reverse map btree */ - if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, - args->agbno, args->len, &args->oinfo); + /* + * We have an aligned record that satisfies minlen and beats or matches + * the candidate extent size. Compare locality for near allocation mode. 
+ */ + diff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->datatype, + bnoa, lena, &bnew); + if (bnew == NULLAGBLOCK) + goto out; + + /* + * Deactivate a bnobt cursor with worse locality than the current best. + */ + if (diff > acur->diff) { + deactivate = isbnobt; + goto out; + } + + ASSERT(args->len > acur->len || + (args->len == acur->len && diff <= acur->diff)); + acur->rec_bno = bno; + acur->rec_len = len; + acur->bno = bnew; + acur->len = args->len; + acur->diff = diff; + *new = 1; + + /* + * We're done if we found a perfect allocation. This only deactivates + * the current cursor, but this is just an optimization to terminate a + * cntbt search that otherwise runs to the edge of the tree. + */ + if (acur->diff == 0 && acur->len == args->maxlen) + deactivate = true; +out: + if (deactivate) + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); + return 0; +} + +/* + * Complete an allocation of a candidate extent. Remove the extent from both + * trees and update the args structure. + */ +STATIC int +xfs_alloc_cur_finish( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + int error; + + ASSERT(acur->cnt && acur->bnolt); + ASSERT(acur->bno >= acur->rec_bno); + ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); + + error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, + acur->rec_len, acur->bno, acur->len, 0); + if (error) + return error; + + args->agbno = acur->bno; + args->len = acur->len; + args->wasfromfl = 0; + + trace_xfs_alloc_cur(args); + return 0; +} + +/* + * Locality allocation lookup algorithm. This expects a cntbt cursor and uses + * bno optimized lookup to search for extents with ideal size and locality. + */ +STATIC int +xfs_alloc_cntbt_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + struct xfs_btree_cur *cur = acur->cnt; + xfs_agblock_t bno; + xfs_extlen_t len, cur_len; + int error; + int i; + + if (!xfs_alloc_cur_active(cur)) + return 0; + + /* locality optimized lookup */ + cur_len = acur->cur_len; + error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); + if (error) + return error; + if (i == 0) + return 0; + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) + return error; + + /* check the current record and update search length from it */ + error = xfs_alloc_cur_check(args, acur, cur, &i); + if (error) + return error; + ASSERT(len >= acur->cur_len); + acur->cur_len = len; + + /* + * We looked up the first record >= [agbno, len] above. The agbno is a + * secondary key and so the current record may lie just before or after + * agbno. If it is past agbno, check the previous record too so long as + * the length matches as it may be closer. Don't check a smaller record + * because that could deactivate our cursor. + */ + if (bno > args->agbno) { + error = xfs_btree_decrement(cur, 0, &i); + if (!error && i) { + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (!error && i && len == acur->cur_len) + error = xfs_alloc_cur_check(args, acur, cur, + &i); + } if (error) return error; } - if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->pag, - args->agbp, - -((long)(args->len))); + /* + * Increment the search key until we find at least one allocation + * candidate or if the extent we found was larger. Otherwise, double the + * search key to optimize the search. Efficiency is more important here + * than absolute best locality. 
+ */ + cur_len <<= 1; + if (!acur->len || acur->cur_len >= cur_len) + acur->cur_len++; + else + acur->cur_len = cur_len; + + return error; +} + +/* + * Deal with the case where only small freespaces remain. Either return the + * contents of the last freespace record, or allocate space from the freelist if + * there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur *ccur, /* optional by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + struct xfs_agf *agf = args->agbp->b_addr; + int error = 0; + xfs_agblock_t fbno = NULLAGBLOCK; + xfs_extlen_t flen = 0; + int i = 0; + + /* + * If a cntbt cursor is provided, try to allocate the largest record in + * the tree. Try the AGFL if the cntbt is empty, otherwise fail the + * allocation. Make sure to respect minleft even when pulling from the + * freelist. + */ + if (ccur) + error = xfs_btree_decrement(ccur, 0, &i); + if (error) + goto error; + if (i) { + error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); if (error) - return error; + goto error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(ccur); + error = -EFSCORRUPTED; + goto error; + } + goto out; + } - ASSERT(!xfs_extent_busy_search(args->mp, args->agno, - args->agbno, args->len)); + if (args->minlen != 1 || args->alignment != 1 || + args->resv == XFS_AG_RESV_AGFL || + be32_to_cpu(agf->agf_flcount) <= args->minleft) + goto out; + + error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, + &fbno, 0); + if (error) + goto error; + if (fbno == NULLAGBLOCK) + goto out; + + xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, + (args->datatype & XFS_ALLOC_NOBUSY)); + + if (args->datatype & XFS_ALLOC_USERDATA) { + struct xfs_buf *bp; + + error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, + xfs_agbno_to_daddr(args->pag, fbno), + args->mp->m_bsize, 0, &bp); + if (error) + goto error; + xfs_trans_binval(args->tp, bp); } + *fbnop = args->agbno = fbno; + *flenp = args->len = 1; + if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { + xfs_btree_mark_sick(ccur); + error = -EFSCORRUPTED; + goto error; + } + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); - xfs_ag_resv_alloc_extent(args->pag, args->resv, args); + /* + * If we're feeding an AGFL block to something that doesn't live in the + * free space, we need to clear out the OWN_AG rmap. + */ + error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + goto error; + + *stat = 0; + return 0; + +out: + /* + * Can't do the allocation, give up. 
+ */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; - XFS_STATS_INC(args->mp, xs_allocx); - XFS_STATS_ADD(args->mp, xs_allocb, args->len); +error: + trace_xfs_alloc_small_error(args); return error; } @@ -781,8 +1315,8 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ - xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ + struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ @@ -797,8 +1331,8 @@ xfs_alloc_ag_vextent_exact( /* * Allocate/initialize a cursor for the by-number freespace btree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -817,7 +1351,11 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); + error = -EFSCORRUPTED; + goto error0; + } ASSERT(fbno <= args->agbno); /* @@ -825,7 +1363,8 @@ xfs_alloc_ag_vextent_exact( */ tbno = fbno; tlen = flen; - xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); + xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, + &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -854,10 +1393,9 @@ xfs_alloc_ag_vextent_exact( * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -886,98 +1424,244 @@ error0: } /* - * Search the btree in a given direction via the search cursor and compare - * the records found against the good extent we've already found. + * Search a given number of btree records in a given direction. Check each + * record against the good extent we've already found. 
*/ STATIC int -xfs_alloc_find_best_extent( - struct xfs_alloc_arg *args, /* allocation argument structure */ - struct xfs_btree_cur **gcur, /* good cursor */ - struct xfs_btree_cur **scur, /* searching cursor */ - xfs_agblock_t gdiff, /* difference for search comparison */ - xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, /* extent length */ - xfs_agblock_t *sbnoa, /* aligned extent found by search */ - xfs_extlen_t *slena, /* aligned extent length */ - int dir) /* 0 = search right, 1 = search left */ -{ - xfs_agblock_t new; - xfs_agblock_t sdiff; +xfs_alloc_walk_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + bool increment, + bool find_one, /* quit on first candidate */ + int count, /* rec count (-1 for infinite) */ + int *stat) +{ int error; int i; - unsigned busy_gen; - /* The good extent is perfect, no need to search. */ - if (!gdiff) - goto out_use_good; + *stat = 0; /* - * Look until we find a better one, run out of space or run off the end. + * Search so long as the cursor is active or we find a better extent. + * The cursor is deactivated if it extends beyond the range of the + * current allocation candidate. */ - do { - error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + while (xfs_alloc_cur_active(cur) && count) { + error = xfs_alloc_cur_check(args, acur, cur, &i); if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, - sbnoa, slena, &busy_gen); + return error; + if (i == 1) { + *stat = 1; + if (find_one) + break; + } + if (!xfs_alloc_cur_active(cur)) + break; + + if (increment) + error = xfs_btree_increment(cur, 0, &i); + else + error = xfs_btree_decrement(cur, 0, &i); + if (error) + return error; + if (i == 0) + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + + if (count > 0) + count--; + } + + return 0; +} + +/* + * Search the by-bno and by-size btrees in parallel in search of an extent with + * ideal locality based on the NEAR mode ->agbno locality hint. + */ +STATIC int +xfs_alloc_ag_vextent_locality( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + int *stat) +{ + struct xfs_btree_cur *fbcur = NULL; + int error; + int i; + bool fbinc; + + ASSERT(acur->len == 0); + + *stat = 0; + + error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); + if (error) + return error; + error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); + if (error) + return error; + error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); + if (error) + return error; + + /* + * Search the bnobt and cntbt in parallel. Search the bnobt left and + * right and lookup the closest extent to the locality hint for each + * extent size key in the cntbt. The entire search terminates + * immediately on a bnobt hit because that means we've found best case + * locality. Otherwise the search continues until the cntbt cursor runs + * off the end of the tree. If no allocation candidate is found at this + * point, give up on locality, walk backwards from the end of the cntbt + * and take the first available extent. + * + * The parallel tree searches balance each other out to provide fairly + * consistent performance for various situations. The bnobt search can + * have pathological behavior in the worst case scenario of larger + * allocation requests and fragmented free space. On the other hand, the + * bnobt is able to satisfy most smaller allocation requests much more + * quickly than the cntbt. 
The cntbt search can sift through fragmented + * free space and sets of free extents for larger allocation requests + * more quickly than the bnobt. Since the locality hint is just a hint + * and we don't want to scan the entire bnobt for perfect locality, the + * cntbt search essentially bounds the bnobt search such that we can + * find good enough locality at reasonable performance in most cases. + */ + while (xfs_alloc_cur_active(acur->bnolt) || + xfs_alloc_cur_active(acur->bnogt) || + xfs_alloc_cur_active(acur->cnt)) { + + trace_xfs_alloc_cur_lookup(args); /* - * The good extent is closer than this one. + * Search the bnobt left and right. In the case of a hit, finish + * the search in the opposite direction and we're done. */ - if (!dir) { - if (*sbnoa > args->max_agbno) - goto out_use_good; - if (*sbnoa >= args->agbno + gdiff) - goto out_use_good; - } else { - if (*sbnoa < args->min_agbno) - goto out_use_good; - if (*sbnoa <= args->agbno - gdiff) - goto out_use_good; + error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, + true, 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_left(args); + fbcur = acur->bnogt; + fbinc = true; + break; + } + error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, + 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_right(args); + fbcur = acur->bnolt; + fbinc = false; + break; } /* - * Same distance, compare length and pick the best. + * Check the extent with best locality based on the current + * extent size search key and keep track of the best candidate. */ - if (*slena >= args->minlen) { - args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); - xfs_alloc_fix_len(args); - - sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, - args->datatype, *sbnoa, - *slena, &new); + error = xfs_alloc_cntbt_iter(args, acur); + if (error) + return error; + if (!xfs_alloc_cur_active(acur->cnt)) { + trace_xfs_alloc_cur_lookup_done(args); + break; + } + } - /* - * Choose closer size and invalidate other cursor. - */ - if (sdiff < gdiff) - goto out_use_search; - goto out_use_good; + /* + * If we failed to find anything due to busy extents, return empty + * handed so the caller can flush and retry. If no busy extents were + * found, walk backwards from the end of the cntbt as a last resort. + */ + if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { + error = xfs_btree_decrement(acur->cnt, 0, &i); + if (error) + return error; + if (i) { + acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; + fbcur = acur->cnt; + fbinc = false; } + } - if (!dir) - error = xfs_btree_increment(*scur, 0, &i); - else - error = xfs_btree_decrement(*scur, 0, &i); + /* + * Search in the opposite direction for a better entry in the case of + * a bnobt hit or walk backwards from the end of the cntbt. + */ + if (fbcur) { + error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, + &i); if (error) - goto error0; - } while (i); + return error; + } -out_use_good: - xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); - *scur = NULL; - return 0; + if (acur->len) + *stat = 1; -out_use_search: - xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); - *gcur = NULL; return 0; +} -error0: - /* caller invalidates cursors */ - return error; +/* Check the last block of the cnt btree for allocations. 
*/ +static int +xfs_alloc_ag_vextent_lastblock( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + xfs_agblock_t *bno, + xfs_extlen_t *len, + bool *allocated) +{ + int error; + int i; + +#ifdef DEBUG + /* Randomly don't execute the first algorithm. */ + if (get_random_u32_below(2)) + return 0; +#endif + + /* + * Start from the entry that lookup found, sequence through all larger + * free blocks. If we're actually pointing at a record smaller than + * maxlen, go to the start of this block, and skip all those smaller + * than minlen. + */ + if (*len || args->alignment > 1) { + acur->cnt->bc_levels[0].ptr = 1; + do { + error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); + if (error) + return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(acur->cnt); + return -EFSCORRUPTED; + } + if (*len >= args->minlen) + break; + error = xfs_btree_increment(acur->cnt, 0, &i); + if (error) + return error; + } while (i); + ASSERT(*len >= args->minlen); + if (!i) + return 0; + } + + error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); + if (error) + return error; + + /* + * It didn't work. We COULD be in a case where there's a good record + * somewhere, so try again. + */ + if (acur->len == 0) + return 0; + + trace_xfs_alloc_near_first(args); + *allocated = true; + return 0; } /* @@ -986,41 +1670,18 @@ error0: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. */ -STATIC int /* error */ +STATIC int xfs_alloc_ag_vextent_near( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { - xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ - xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ - xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ - xfs_agblock_t gtbno; /* start bno of right side entry */ - xfs_agblock_t gtbnoa; /* aligned ... */ - xfs_extlen_t gtdiff; /* difference to right side entry */ - xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena; /* aligned ... */ - xfs_agblock_t gtnew; /* useful start bno of right side */ - int error; /* error code */ - int i; /* result code, temporary */ - int j; /* result code, temporary */ - xfs_agblock_t ltbno; /* start bno of left side entry */ - xfs_agblock_t ltbnoa; /* aligned ... */ - xfs_extlen_t ltdiff; /* difference to left side entry */ - xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena; /* aligned ... */ - xfs_agblock_t ltnew; /* useful start bno of left side */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; -#ifdef DEBUG - /* - * Randomly don't execute the first algorithm. - */ - int dofirst; /* set to do first algorithm */ + struct xfs_alloc_cur acur = {}; + int error; /* error code */ + int i; /* result code, temporary */ + xfs_agblock_t bno; + xfs_extlen_t len; - dofirst = prandom_u32() & 1; -#endif - - /* handle unitialized agbno range so caller doesn't have to */ + /* handle uninitialized agbno range so caller doesn't have to */ if (!args->min_agbno && !args->max_agbno) args->max_agbno = args->mp->m_sb.sb_agblocks - 1; ASSERT(args->min_agbno <= args->max_agbno); @@ -1031,41 +1692,30 @@ xfs_alloc_ag_vextent_near( if (args->agbno > args->max_agbno) args->agbno = args->max_agbno; + /* Retry once quickly if we find busy extents before blocking. 
*/ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: - bno_cur_lt = NULL; - bno_cur_gt = NULL; - ltlen = 0; - gtlena = 0; - ltlena = 0; - busy = false; + len = 0; /* - * Get a cursor for the by-size btree. - */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); - - /* - * See if there are any free extents as big as maxlen. - */ - if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) - goto error0; - /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. + * Set up cursors and see if there are any free extents as big as + * maxlen. If not, pick the last entry in the tree unless the tree is + * empty. */ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, - <len, &i))) - goto error0; - if (i == 0 || ltlen == 0) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + error = xfs_alloc_cur_setup(args, &acur); + if (error == -ENOSPC) { + error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, + &len, &i); + if (error) + goto out; + if (i == 0 || len == 0) { trace_xfs_alloc_near_noentry(args); - return 0; + goto out; } ASSERT(i == 1); + } else if (error) { + goto out; } - args->wasfromfl = 0; /* * First algorithm. @@ -1074,311 +1724,59 @@ restart: * near the right edge of the tree. If it's in the last btree leaf * block, then we just examine all the entries in that block * that are big enough, and pick the best one. - * This is written as a while loop so we can break out of it, - * but we never loop back to the top. */ - while (xfs_btree_islastblock(cnt_cur, 0)) { - xfs_extlen_t bdiff; - int besti=0; - xfs_extlen_t blen=0; - xfs_agblock_t bnew=0; - -#ifdef DEBUG - if (dofirst) - break; -#endif - /* - * Start from the entry that lookup found, sequence through - * all larger free blocks. If we're actually pointing at a - * record smaller than maxlen, go to the start of this block, - * and skip all those smaller than minlen. - */ - if (ltlen || args->alignment > 1) { - cnt_cur->bc_ptrs[0] = 1; - do { - if ((error = xfs_alloc_get_rec(cnt_cur, <bno, - <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - if (ltlen >= args->minlen) - break; - if ((error = xfs_btree_increment(cnt_cur, 0, &i))) - goto error0; - } while (i); - ASSERT(ltlen >= args->minlen); - if (!i) - break; - } - i = cnt_cur->bc_ptrs[0]; - for (j = 1, blen = 0, bdiff = 0; - !error && j && (blen < args->maxlen || bdiff > 0); - error = xfs_btree_increment(cnt_cur, 0, &j)) { - /* - * For each entry, decide if it's better than - * the previous best entry. - */ - if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena, &busy_gen); - if (ltlena < args->minlen) - continue; - if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) - continue; - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - ASSERT(args->len >= args->minlen); - if (args->len < blen) - continue; - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, ltbnoa, - ltlena, <new); - if (ltnew != NULLAGBLOCK && - (args->len > blen || ltdiff < bdiff)) { - bdiff = ltdiff; - bnew = ltnew; - blen = args->len; - besti = cnt_cur->bc_ptrs[0]; - } - } - /* - * It didn't work. We COULD be in a case where - * there's a good record somewhere, so try again. - */ - if (blen == 0) - break; - /* - * Point at the best entry, and retrieve it again. 
-	 */
-	cnt_cur->bc_ptrs[0] = besti;
-	if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
-		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-	ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	args->len = blen;
+	if (xfs_btree_islastblock(acur.cnt, 0)) {
+		bool		allocated = false;
 
-	/*
-	 * We are allocating starting at bnew for blen blocks.
-	 */
-	args->agbno = bnew;
-	ASSERT(bnew >= ltbno);
-	ASSERT(bnew + blen <= ltbno + ltlen);
-	/*
-	 * Set up a cursor for the by-bno tree.
-	 */
-	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
-		args->agbp, args->agno, XFS_BTNUM_BNO);
-	/*
-	 * Fix up the btree entries.
-	 */
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
-			ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
-		goto error0;
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-
-	trace_xfs_alloc_near_first(args);
-	return 0;
-	}
-	/*
-	 * Second algorithm.
-	 * Search in the by-bno tree to the left and to the right
-	 * simultaneously, until in each case we find a space big enough,
-	 * or run into the edge of the tree. When we run into the edge,
-	 * we deallocate that cursor.
-	 * If both searches succeed, we compare the two spaces and pick
-	 * the better one.
-	 * With alignment, it's possible for both to fail; the upper
-	 * level algorithm that picks allocation groups for allocations
-	 * is not supposed to do this.
-	 */
-	/*
-	 * Allocate and initialize the cursor for the leftward search.
-	 */
-	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO);
-	/*
-	 * Lookup <= bno to find the leftward search's starting point.
-	 */
-	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
-		goto error0;
-	if (!i) {
-		/*
-		 * Didn't find anything; use this cursor for the rightward
-		 * search.
-		 */
-		bno_cur_gt = bno_cur_lt;
-		bno_cur_lt = NULL;
-	}
-	/*
-	 * Found something. Duplicate the cursor for the rightward search.
-	 */
-	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
-		goto error0;
-	/*
-	 * Increment the cursor, so we will point at the entry just right
-	 * of the leftward entry if any, or to the leftmost entry.
-	 */
-	if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-		goto error0;
-	if (!i) {
-		/*
-		 * It failed, there are no rightward entries.
-		 */
-		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
-		bno_cur_gt = NULL;
+		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
+				&allocated);
+		if (error)
+			goto out;
+		if (allocated)
+			goto alloc_finish;
 	}
-	/*
-	 * Loop going left with the leftward cursor, right with the
-	 * rightward cursor, until either both directions give up or
-	 * we find an entry at least as big as minlen.
-	 */
-	do {
-		if (bno_cur_lt) {
-			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
-					&ltbnoa, &ltlena, &busy_gen);
-			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
-				break;
-			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
-				goto error0;
-			if (!i || ltbnoa < args->min_agbno) {
-				xfs_btree_del_cursor(bno_cur_lt,
-						XFS_BTREE_NOERROR);
-				bno_cur_lt = NULL;
-			}
-		}
-		if (bno_cur_gt) {
-			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
-					&gtbnoa, &gtlena, &busy_gen);
-			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
-				break;
-			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
-				goto error0;
-			if (!i || gtbnoa > args->max_agbno) {
-				xfs_btree_del_cursor(bno_cur_gt,
-						XFS_BTREE_NOERROR);
-				bno_cur_gt = NULL;
-			}
-		}
-	} while (bno_cur_lt || bno_cur_gt);
 
 	/*
-	 * Got both cursors still active, need to find better entry.
+	 * Second algorithm. Combined cntbt and bnobt search to find ideal
+	 * locality.
 	 */
-	if (bno_cur_lt && bno_cur_gt) {
-		if (ltlena >= args->minlen) {
-			/*
-			 * Left side is good, look for a right side entry.
-			 */
-			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-			xfs_alloc_fix_len(args);
-			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->datatype, ltbnoa,
-				ltlena, &ltnew);
-
-			error = xfs_alloc_find_best_extent(args,
-						&bno_cur_lt, &bno_cur_gt,
-						ltdiff, &gtbno, &gtlen,
-						&gtbnoa, &gtlena,
-						0 /* search right */);
-		} else {
-			ASSERT(gtlena >= args->minlen);
-
-			/*
-			 * Right side is good, look for a left side entry.
-			 */
-			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
-			xfs_alloc_fix_len(args);
-			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->datatype, gtbnoa,
-				gtlena, &gtnew);
-
-			error = xfs_alloc_find_best_extent(args,
-						&bno_cur_gt, &bno_cur_lt,
-						gtdiff, &ltbno, &ltlen,
-						&ltbnoa, &ltlena,
-						1 /* search left */);
-		}
-
-		if (error)
-			goto error0;
-	}
+	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
+	if (error)
+		goto out;
 
 	/*
 	 * If we couldn't get anything, give up.
 	 */
-	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
-		if (busy) {
+	if (!acur.len) {
+		if (acur.busy) {
+			/*
+			 * Our only valid extents must have been busy. Flush and
+			 * retry the allocation again. If we get an -EAGAIN
+			 * error, we're being told that a deadlock was avoided
+			 * and the current transaction needs committing before
+			 * the allocation can be retried.
+			 */
 			trace_xfs_alloc_near_busy(args);
-			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+			error = xfs_extent_busy_flush(args->tp,
+					pag_group(args->pag), acur.busy_gen,
+					alloc_flags);
+			if (error)
+				goto out;
+
+			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
 			goto restart;
 		}
 		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
-		return 0;
+		goto out;
 	}
-	/*
-	 * At this point we have selected a freespace entry, either to the
-	 * left or to the right. If it's on the right, copy all the
-	 * useful variables to the "left" set so we only have one
-	 * copy of this code.
-	 */
-	if (bno_cur_gt) {
-		bno_cur_lt = bno_cur_gt;
-		bno_cur_gt = NULL;
-		ltbno = gtbno;
-		ltbnoa = gtbnoa;
-		ltlen = gtlen;
-		ltlena = gtlena;
-		j = 1;
-	} else
-		j = 0;
-
-	/*
-	 * Fix up the length and compute the useful address.
-	 */
-	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
-	xfs_alloc_fix_len(args);
-	rlen = args->len;
-	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-				     args->datatype, ltbnoa, ltlena, &ltnew);
-	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
-	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
-	args->agbno = ltnew;
-
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
-			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
-		goto error0;
-
-	if (j)
-		trace_xfs_alloc_near_greater(args);
-	else
-		trace_xfs_alloc_near_lesser(args);
-
-	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-	return 0;
+alloc_finish:
+	/* fix up btrees on a successful allocation */
+	error = xfs_alloc_cur_finish(args, &acur);
 
-error0:
-	trace_xfs_alloc_near_error(args);
-	if (cnt_cur != NULL)
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
-	if (bno_cur_lt != NULL)
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
-	if (bno_cur_gt != NULL)
-		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+out:
+	xfs_alloc_cur_close(&acur, error);
 	return error;
 }
 
@@ -1388,29 +1786,32 @@ restart:
  * and of the form k * prod + mod unless there's nothing that large.
  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
  */
-STATIC int			/* error */
+static int
 xfs_alloc_ag_vextent_size(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+	struct xfs_alloc_arg	*args,
+	uint32_t		alloc_flags)
 {
-	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
-	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
-	int		error;		/* error result */
-	xfs_agblock_t	fbno;		/* start of found freespace */
-	xfs_extlen_t	flen;		/* length of found freespace */
-	int		i;		/* temp status variable */
-	xfs_agblock_t	rbno;		/* returned block number */
-	xfs_extlen_t	rlen;		/* length of returned extent */
-	bool		busy;
-	unsigned	busy_gen;
+	struct xfs_agf		*agf = args->agbp->b_addr;
+	struct xfs_btree_cur	*bno_cur;
+	struct xfs_btree_cur	*cnt_cur;
+	xfs_agblock_t		fbno;	/* start of found freespace */
+	xfs_extlen_t		flen;	/* length of found freespace */
+	xfs_agblock_t		rbno;	/* returned block number */
+	xfs_extlen_t		rlen;	/* length of returned extent */
+	bool			busy;
+	unsigned		busy_gen;
+	int			error;
+	int			i;
 
+	/* Retry once quickly if we find busy extents before blocking. */
+	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
 restart:
 	/*
 	 * Allocate and initialize a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+			args->pag);
 	bno_cur = NULL;
-	busy = false;
 
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1447,7 +1848,11 @@ restart:
 		error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
 		if (error)
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 
 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
 				&rlen, &busy_gen);
@@ -1458,19 +1863,26 @@ restart:
 			error = xfs_btree_increment(cnt_cur, 0, &i);
 			if (error)
 				goto error0;
-			if (i == 0) {
-				/*
-				 * Our only valid extents must have been busy.
-				 * Make it unbusy by forcing the log out and
-				 * retrying.
- */ - xfs_btree_del_cursor(cnt_cur, - XFS_BTREE_NOERROR); - trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, - args->pag, busy_gen); - goto restart; - } + if (i) + continue; + + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ + trace_xfs_alloc_size_busy(args); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + goto restart; } } @@ -1481,8 +1893,14 @@ restart: * This can't happen in the second case above. */ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } if (rlen < args->maxlen) { xfs_agblock_t bestfbno; xfs_extlen_t bestflen; @@ -1501,15 +1919,24 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - if (flen < bestrlen) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), - error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } if (rlen > bestrlen) { bestrlen = rlen; bestrbno = rbno; @@ -1522,7 +1949,11 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1535,9 +1966,22 @@ restart: args->len = rlen; if (rlen < args->minlen) { if (busy) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); goto restart; } goto out_nominleft; @@ -1545,12 +1989,16 @@ restart: xfs_alloc_fix_len(args); rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); + if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } /* * Allocate and initialize a cursor for the by-block tree. 
*/ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1559,10 +2007,13 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO(args->mp, - args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error0); + if (XFS_IS_CORRUPT(args->mp, + args->agbno + args->len > + be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); + error = -EFSCORRUPTED; + goto error0; + } trace_xfs_alloc_size_done(args); return 0; @@ -1582,126 +2033,18 @@ out_nominleft: } /* - * Deal with the case where only small freespaces remain. - * Either return the contents of the last freespace record, - * or allocate space from the freelist if there is nothing in the tree. - */ -STATIC int /* error */ -xfs_alloc_ag_vextent_small( - xfs_alloc_arg_t *args, /* allocation argument structure */ - xfs_btree_cur_t *ccur, /* by-size cursor */ - xfs_agblock_t *fbnop, /* result block number */ - xfs_extlen_t *flenp, /* result length */ - int *stat) /* status: 0-freelist, 1-normal/none */ -{ - int error; - xfs_agblock_t fbno; - xfs_extlen_t flen; - int i; - - if ((error = xfs_btree_decrement(ccur, 0, &i))) - goto error0; - if (i) { - if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - } - /* - * Nothing in the btree, try the freelist. Make sure - * to respect minleft even when pulling from the - * freelist. - */ - else if (args->minlen == 1 && args->alignment == 1 && - args->resv != XFS_AG_RESV_AGFL && - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) - > args->minleft)) { - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); - if (error) - goto error0; - if (fbno != NULLAGBLOCK) { - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - xfs_alloc_allow_busy_reuse(args->datatype)); - - if (xfs_alloc_is_userdata(args->datatype)) { - xfs_buf_t *bp; - - bp = xfs_btree_get_bufs(args->mp, args->tp, - args->agno, fbno, 0); - if (!bp) { - error = -EFSCORRUPTED; - goto error0; - } - xfs_trans_binval(args->tp, bp); - } - args->len = 1; - args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO(args->mp, - args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error0); - args->wasfromfl = 1; - trace_xfs_alloc_small_freelist(args); - - /* - * If we're feeding an AGFL block to something that - * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. - */ - error = xfs_rmap_free(args->tp, args->agbp, args->agno, - fbno, 1, &XFS_RMAP_OINFO_AG); - if (error) - goto error0; - - *stat = 0; - return 0; - } - /* - * Nothing in the freelist. - */ - else - flen = 0; - } - /* - * Can't allocate from the freelist for some reason. - */ - else { - fbno = NULLAGBLOCK; - flen = 0; - } - /* - * Can't do the allocation, give up. - */ - if (flen < args->minlen) { - args->agbno = NULLAGBLOCK; - trace_xfs_alloc_small_notenough(args); - flen = 0; - } - *fbnop = fbno; - *flenp = flen; - *stat = 1; - trace_xfs_alloc_small_done(args); - return 0; - -error0: - trace_xfs_alloc_small_error(args); - return error; -} - -/* * Free the extent starting at agno/bno for length. 
 */
-STATIC int
+int
 xfs_free_ag_extent(
 	struct xfs_trans		*tp,
 	struct xfs_buf			*agbp,
-	xfs_agnumber_t			agno,
 	xfs_agblock_t			bno,
 	xfs_extlen_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type)
 {
 	struct xfs_mount		*mp;
-	struct xfs_perag		*pag;
 	struct xfs_btree_cur		*bno_cur;
 	struct xfs_btree_cur		*cnt_cur;
 	xfs_agblock_t			gtbno; /* start of right neighbor */
@@ -1714,12 +2057,14 @@ xfs_free_ag_extent(
 	int				haveright; /* have a right neighbor */
 	int				i;
 	int				error;
+	struct xfs_perag		*pag = agbp->b_pag;
+	bool				fixup_longest = false;
 
 	bno_cur = cnt_cur = NULL;
 	mp = tp->t_mountp;
 
 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
-		error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
 		if (error)
 			goto error0;
 	}
@@ -1727,7 +2072,7 @@ xfs_free_ag_extent(
 	/*
 	 * Allocate and initialize a cursor for the by-block btree.
 	 */
-	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
 	/*
 	 * Look for a neighboring block on the left (lower block numbers)
 	 * that is contiguous with this space.
@@ -1740,7 +2085,11 @@
 		 */
 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * It's not contiguous, though.
 		 */
@@ -1752,8 +2101,11 @@
 			 * space was invalid, it's (partly) already free.
 			 * Very bad.
 			 */
-			XFS_WANT_CORRUPTED_GOTO(mp,
-				ltbno + ltlen <= bno, error0);
+			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+				xfs_btree_mark_sick(bno_cur);
+				error = -EFSCORRUPTED;
+				goto error0;
+			}
 		}
 	}
 	/*
@@ -1768,7 +2120,11 @@
 		 */
 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * It's not contiguous, though.
 		 */
@@ -1780,13 +2136,17 @@
 			 * space was invalid, it's (partly) already free.
 			 * Very bad.
 			 */
-			XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
+			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+				xfs_btree_mark_sick(bno_cur);
+				error = -EFSCORRUPTED;
+				goto error0;
+			}
 		}
 	}
 	/*
 	 * Now allocate and initialize a cursor for the by-size tree.
 	 */
-	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
 	/*
 	 * Have both left and right contiguous neighbors.
 	 * Merge all three into a single free block.
 	 */
@@ -1797,31 +2157,55 @@
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * Delete the old by-size entry on the right.
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * Delete the old by-block entry for the right block.
 		 */
 		if ((error = xfs_btree_delete(bno_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * Move the by-block cursor back to the left neighbor.
 		 */
 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 #ifdef DEBUG
 		/*
 		 * Check that this is the right record: delete didn't
@@ -1834,9 +2218,14 @@
 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
 					&i)))
 				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(mp,
-				i == 1 && xxbno == ltbno && xxlen == ltlen,
-				error0);
+			if (XFS_IS_CORRUPT(mp,
+					   i != 1 ||
+					   xxbno != ltbno ||
+					   xxlen != ltlen)) {
+				xfs_btree_mark_sick(bno_cur);
+				error = -EFSCORRUPTED;
+				goto error0;
+			}
 		}
 #endif
 		/*
@@ -1857,17 +2246,29 @@
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * Back up the by-block cursor to the left neighbor, and
 		 * update its length.
 		 */
 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		nbno = ltbno;
 		nlen = len + ltlen;
 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1883,10 +2284,18 @@
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 		/*
 		 * Update the starting block and length of the right
 		 * neighbor in the by-block tree.
@@ -1905,41 +2314,64 @@
 		nlen = len;
 		if ((error = xfs_btree_insert(bno_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
+			error = -EFSCORRUPTED;
+			goto error0;
+		}
 	}
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	bno_cur = NULL;
+
 	/*
 	 * In all cases we need to insert the new freespace in the by-size tree.
+	 *
+	 * If this new freespace is being inserted in the block that contains
+	 * the largest free space in the btree, make sure we also fix up the
+	 * agf->agf_longest tracker field.
*/ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } + if (xfs_alloc_cursor_at_lastrec(cnt_cur)) + fixup_longest = true; if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto error0; + } + if (fixup_longest) { + error = xfs_alloc_fixup_longest(cnt_cur); + if (error) + goto error0; + } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; /* * Update the freespace totals in the ag and superblock. */ - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_update_counters(tp, pag, agbp, len); + error = xfs_alloc_update_counters(tp, agbp, len); xfs_ag_resv_free_extent(pag, type, tp, len); - xfs_perag_put(pag); if (error) goto error0; XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); + trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); + trace_xfs_free_extent(pag, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1953,14 +2385,15 @@ xfs_free_ag_extent( */ /* - * Compute and fill in value of m_ag_maxlevels. + * Compute and fill in value of m_alloc_maxlevels. */ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, + mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); + ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); } /* @@ -1997,31 +2430,58 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return pag->pagf_longest - delta; + return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, + pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; + const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; + const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; unsigned int min_free; + ASSERT(mp->m_alloc_maxlevels > 0); + + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. 
This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, - mp->m_ag_maxlevels); + min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, - mp->m_ag_maxlevels); + min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); - + if (xfs_has_rmapbt(mp)) + min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; return min_free; } @@ -2041,6 +2501,7 @@ xfs_alloc_space_available( xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; + xfs_extlen_t agflcount; if (flags & XFS_ALLOC_FLAG_FREEING) return true; @@ -2053,8 +2514,13 @@ xfs_alloc_space_available( if (longest < alloc_len) return false; - /* do we have enough free space remaining for the allocation? */ - available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + /* + * Do we have enough free space remaining for the allocation? Don't + * account extra agfl blocks because we are about to defer free them, + * making them unavailable until the current transaction commits. + */ + agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); + available = (int)(pag->pagf_freeblks + agflcount - reservation - min_free - args->minleft); if (available < (int)max(args->total, alloc_len)) return false; @@ -2072,37 +2538,17 @@ xfs_alloc_space_available( return true; } -int -xfs_free_agfl_block( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t agbno, - struct xfs_buf *agbp, - struct xfs_owner_info *oinfo) -{ - int error; - struct xfs_buf *bp; - - error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, - XFS_AG_RESV_AGFL); - if (error) - return error; - - bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0); - if (!bp) - return -EFSCORRUPTED; - xfs_trans_binval(tp, bp); - - return 0; -} - /* - * Check the agfl fields of the agf for inconsistency or corruption. The purpose - * is to detect an agfl header padding mismatch between current and early v5 - * kernels. This problem manifests as a 1-slot size difference between the - * on-disk flcount and the active [first, last] range of a wrapped agfl. This - * may also catch variants of agfl count corruption unrelated to padding. Either - * way, we'll reset the agfl and warn the user. + * Check the agfl fields of the agf for inconsistency or corruption. + * + * The original purpose was to detect an agfl header padding mismatch between + * current and early v5 kernels. This problem manifests as a 1-slot size + * difference between the on-disk flcount and the active [first, last] range of + * a wrapped agfl. + * + * However, we need to use these same checks to catch agfl count corruptions + * unrelated to padding. This could occur on any v4 or v5 filesystem, so either + * way, we need to reset the agfl and warn the user. * * Return true if a reset is required before the agfl can be used, false * otherwise. 
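
The worst-case arithmetic above is easy to sanity-check by hand. The following standalone sketch (not part of the patch, and assuming only the kernel's min() macro) recomputes the per-btree AGFL floor from the current heights. For example, with m_alloc_maxlevels = 5 and both free space btrees at height 2, each btree reserves min(2 + 1, 5) * 2 - 2 = 4 blocks, so a filesystem without the rmapbt needs at least 8 AGFL blocks:

/*
 * Illustrative sketch only; the real code is xfs_alloc_min_freelist() above.
 */
static unsigned int
example_min_freelist(
	unsigned int	bno_level,	/* current bnobt height */
	unsigned int	cnt_level,	/* current cntbt height */
	unsigned int	alloc_maxlevels)
{
	unsigned int	min_free;

	/* full height split + AGFL refill split == 2 * new height - 2 */
	min_free = min(bno_level + 1, alloc_maxlevels) * 2 - 2;
	min_free += min(cnt_level + 1, alloc_maxlevels) * 2 - 2;
	return min_free;
}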
@@ -2118,10 +2564,6 @@ xfs_agfl_needs_reset( int agfl_size = xfs_agfl_size(mp); int active; - /* no agfl header on v4 supers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - /* * The agf read verifier catches severe corruption of these fields. * Repeat some sanity checks to cover a packed -> unpacked mismatch if @@ -2163,15 +2605,15 @@ xfs_agfl_reset( struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; - ASSERT(pag->pagf_agflreset); + ASSERT(xfs_perag_agfl_needs_reset(pag)); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); xfs_warn(mp, "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " "Please unmount and run xfs_repair.", - pag->pag_agno, pag->pagf_flcount); + pag_agno(pag), pag->pagf_flcount); agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); @@ -2180,41 +2622,200 @@ xfs_agfl_reset( XFS_AGF_FLCOUNT); pag->pagf_flcount = 0; - pag->pagf_agflreset = false; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); } /* - * Defer an AGFL block free. This is effectively equivalent to - * xfs_bmap_add_free() with some special handling particular to AGFL blocks. - * - * Deferring AGFL frees helps prevent log reservation overruns due to too many - * allocation operations in a transaction. AGFL frees are prone to this problem - * because for one they are always freed one at a time. Further, an immediate - * AGFL block free can cause a btree join and require another block free before - * the real allocation can proceed. Deferring the free disconnects freeing up - * the AGFL slot from freeing the block. + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). 
 */
-STATIC void
-xfs_defer_agfl_block(
+static int
+xfs_defer_extent_free(
 	struct xfs_trans		*tp,
-	xfs_agnumber_t			agno,
-	xfs_fsblock_t			agbno,
-	struct xfs_owner_info		*oinfo)
+	xfs_fsblock_t			bno,
+	xfs_filblks_t			len,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type,
+	unsigned int			free_flags,
+	struct xfs_defer_pending	**dfpp)
 {
+	struct xfs_extent_free_item	*xefi;
 	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_extent_free_item	*new;		/* new element */
 
-	ASSERT(xfs_bmap_free_item_zone != NULL);
-	ASSERT(oinfo != NULL);
+	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+	ASSERT(!isnullstartblock(bno));
+	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
+
+	if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+		if (type != XFS_AG_RESV_NONE) {
+			ASSERT(type == XFS_AG_RESV_NONE);
+			return -EFSCORRUPTED;
+		}
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	} else {
+		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+			return -EFSCORRUPTED;
+	}
+
+	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
+			GFP_KERNEL | __GFP_NOFAIL);
+	xefi->xefi_startblock = bno;
+	xefi->xefi_blockcount = (xfs_extlen_t)len;
+	xefi->xefi_agresv = type;
+	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
+		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+	if (free_flags & XFS_FREE_EXTENT_REALTIME)
+		xefi->xefi_flags |= XFS_EFI_REALTIME;
+	if (oinfo) {
+		ASSERT(oinfo->oi_offset == 0);
+
+		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
+		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+		xefi->xefi_owner = oinfo->oi_owner;
+	} else {
+		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
+	}
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
-	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
-	new->xefi_blockcount = 1;
-	new->xefi_oinfo = *oinfo;
+	xfs_extent_free_defer_add(tp, xefi, dfpp);
+	return 0;
+}
+
+int
+xfs_free_extent_later(
+	struct xfs_trans		*tp,
+	xfs_fsblock_t			bno,
+	xfs_filblks_t			len,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type,
+	unsigned int			free_flags)
+{
+	struct xfs_defer_pending	*dontcare = NULL;
+
+	return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
+			&dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attaches a paused deferred extent free item to the
+ * transaction. Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is unpaused, the next transaction commit will log an EFD
+ * without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
+ * @args structure are set to the relevant values.
+ */
+int
+xfs_alloc_schedule_autoreap(
+	const struct xfs_alloc_arg	*args,
+	unsigned int			free_flags,
+	struct xfs_alloc_autoreap	*aarp)
+{
+	int				error;
+
+	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+			&args->oinfo, args->resv, free_flags, &aarp->dfp);
+	if (error)
+		return error;
+
+	xfs_defer_item_pause(args->tp, aarp->dfp);
+	return 0;
+}
+
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. + * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; - trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + +/* + * Check if an AGF has a free extent record whose length is equal to + * args->minlen. + */ +STATIC int +xfs_exact_minlen_extent_available( + struct xfs_alloc_arg *args, + struct xfs_buf *agbp, + int *stat) +{ + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, + args->pag); + error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); + if (error) + goto out; + + if (*stat == 0) { + xfs_btree_mark_sick(cnt_cur); + error = -EFSCORRUPTED; + goto out; + } + + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); + if (error) + goto out; + + if (*stat == 1 && flen != args->minlen) + *stat = 0; + +out: + xfs_btree_del_cursor(cnt_cur, error); + + return error; } /* @@ -2224,7 +2825,7 @@ xfs_defer_agfl_block( int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; struct xfs_perag *pag = args->pag; @@ -2236,14 +2837,16 @@ xfs_alloc_fix_freelist( xfs_extlen_t need; /* total blocks needed in freelist */ int error = 0; - if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) + /* deferred ops (AGFL block frees) require permanent transactions */ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + if (!xfs_perag_initialised_agf(pag)) { + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); + if (error) { + /* Couldn't lock the AGF so skip this AG. 
*/ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; - if (!pag->pagf_init) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - goto out_agbp_relse; } } @@ -2252,14 +2855,15 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && - (flags & XFS_ALLOC_FLAG_TRYLOCK)) { - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + if (xfs_perag_prefers_metadata(pag) && + (args->datatype & XFS_ALLOC_USERDATA) && + (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags | + if (!xfs_alloc_space_available(args, need, alloc_flags | XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; @@ -2268,25 +2872,32 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) - goto out_no_agbp; - if (!agbp) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; } } /* reset a padding mismatched agfl before final free space check */ - if (pag->pagf_agflreset) + if (xfs_perag_agfl_needs_reset(pag)) xfs_agfl_reset(tp, agbp, pag); /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; + if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { + int stat; + + error = xfs_exact_minlen_extent_available(args, agbp, &stat); + if (error || !stat) + goto out_agbp_relse; + } + /* * Make the freelist shorter if it's too long. * @@ -2313,17 +2924,32 @@ xfs_alloc_fix_freelist( */ memset(&targs, 0, sizeof(targs)); /* struct copy below */ - if (flags & XFS_ALLOC_FLAG_NORMAP) + if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else targs.oinfo = XFS_RMAP_OINFO_AG; - while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && + pag->pagf_flcount > need) { + error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - /* defer agfl frees */ - xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + /* + * Defer the AGFL block free. + * + * This helps to prevent log reservation overruns due to too + * many allocation operations in a transaction. AGFL frees are + * prone to this problem because for one they are always freed + * one at a time. Further, an immediate AGFL block free can + * cause a btree join and require another block free before the + * real allocation can proceed. + * Deferring the free disconnects freeing up the AGFL slot from + * freeing the block. 
+ */ + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), + 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); + if (error) + goto out_agbp_relse; } targs.tp = tp; @@ -2331,9 +2957,8 @@ xfs_alloc_fix_freelist( targs.agbp = agbp; targs.agno = args->agno; targs.alignment = targs.minlen = targs.prod = 1; - targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -2344,7 +2969,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent(&targs); + error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); if (error) goto out_agflbp_relse; @@ -2354,15 +2979,27 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (flags & XFS_ALLOC_FLAG_FREEING) + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) break; goto out_agflbp_relse; } + + if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { + error = xfs_rmap_alloc(tp, agbp, pag, + targs.agbno, targs.len, &targs.oinfo); + if (error) + goto out_agflbp_relse; + } + error = xfs_alloc_update_counters(tp, agbp, + -((long)(targs.len))); + if (error) + goto out_agflbp_relse; + /* * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - error = xfs_alloc_put_freelist(tp, agbp, + error = xfs_alloc_put_freelist(pag, tp, agbp, agflbp, bno, 0); if (error) goto out_agflbp_relse; @@ -2386,26 +3023,25 @@ out_no_agbp: * Get a block from the freelist. * Returns with the buffer for the block gotten. */ -int /* error */ +int xfs_alloc_get_freelist( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk) /* destination is a AGF btree */ -{ - xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ - xfs_agblock_t bno; /* block number returned */ - __be32 *agfl_bno; - int error; - int logflags; - xfs_mount_t *mp = tp->t_mountp; - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agblock_t *bnop, + int btreeblk) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agflbp; + xfs_agblock_t bno; + __be32 *agfl_bno; + int error; + uint32_t logflags; + struct xfs_mount *mp = tp->t_mountp; /* * Freelist is empty, give up. */ - agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2413,8 +3049,7 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), - &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) return error; @@ -2422,17 +3057,18 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. 
*/ - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) + return -EFSCORRUPTED; + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, -1); - xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; @@ -2441,7 +3077,6 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2454,9 +3089,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2465,8 +3100,8 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_versionnum), offsetof(xfs_agf_t, agf_seqno), offsetof(xfs_agf_t, agf_length), - offsetof(xfs_agf_t, agf_roots[0]), - offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ + offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ offsetof(xfs_agf_t, agf_flfirst), offsetof(xfs_agf_t, agf_fllast), offsetof(xfs_agf_t, agf_flcount), @@ -2483,7 +3118,7 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; - trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); @@ -2492,59 +3127,37 @@ xfs_alloc_log_agf( } /* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags) /* XFS_ALLOC_FLAGS_... */ -{ - xfs_buf_t *bp; - int error; - - if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) - return error; - if (bp) - xfs_trans_brelse(tp, bp); - return 0; -} - -/* * Put the block on the freelist for the allocation group. */ -int /* error */ +int xfs_alloc_put_freelist( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer for a.g. freelist header */ - xfs_buf_t *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk) /* block came from a AGF btree */ -{ - xfs_agf_t *agf; /* a.g. 
freespace structure */ - __be32 *blockp;/* pointer to array entry */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_buf *agflbp, + xfs_agblock_t bno, + int btreeblk) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = agbp->b_addr; + __be32 *blockp; int error; - int logflags; - xfs_mount_t *mp; /* mount structure */ - xfs_perag_t *pag; /* per allocation group data */ + uint32_t logflags; __be32 *agfl_bno; int startoff; - agf = XFS_BUF_TO_AGF(agbp); - mp = tp->t_mountp; + if (!agflbp) { + error = xfs_alloc_read_agfl(pag, tp, &agflbp); + if (error) + return error; + } - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) - return error; be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, 1); - xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; @@ -2553,13 +3166,10 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } - xfs_perag_put(pag); - - xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); startoff = (char *)blockp - (char *)agflbp->b_addr; @@ -2572,75 +3182,146 @@ xfs_alloc_put_freelist( return 0; } +/* + * Check that this AGF/AGI header's sequence number and length matches the AG + * number and size in fsblocks. + */ +xfs_failaddr_t +xfs_validate_ag_length( + struct xfs_buf *bp, + uint32_t seqno, + uint32_t length) +{ + struct xfs_mount *mp = bp->b_mount; + /* + * During growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && seqno != pag_agno(bp->b_pag)) + return __this_address; + + /* + * Only the last AG in the filesystem is allowed to be shorter + * than the AG size recorded in the superblock. + */ + if (length != mp->m_sb.sb_agblocks) { + /* + * During growfs, the new last AG can get here before we + * have updated the superblock. Give it a pass on the seqno + * check. + */ + if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) + return __this_address; + if (length < XFS_MIN_AG_BLOCKS) + return __this_address; + if (length > mp->m_sb.sb_agblocks) + return __this_address; + } + + return NULL; +} + +/* + * Verify the AGF is consistent. + * + * We do not verify the AGFL indexes in the AGF are fully consistent here + * because of issues with variable on-disk structure sizes. Instead, we check + * the agfl indexes for consistency when we initialise the perag from the AGF + * information after a read completes. + * + * If the index is inconsistent, then we mark the perag as needing an AGFL + * reset. The first AGFL update performed then resets the AGFL indexes and + * refills the AGFL with known good free blocks, allowing the filesystem to + * continue operating normally at the cost of a few leaked free space blocks. 
+ */
 static xfs_failaddr_t
 xfs_agf_verify(
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_agf		*agf = bp->b_addr;
+	xfs_failaddr_t		fa;
+	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
+	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (xfs_has_crc(mp)) {
 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
-		if (!xfs_log_check_lsn(mp,
-				be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
 			return __this_address;
 	}
 
-	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
-	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
-	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
-		return __this_address;
-
-	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+	if (!xfs_verify_magic(bp, agf->agf_magicnum))
 		return __this_address;
 
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
-	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
-	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
+	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
 		return __this_address;
 
 	/*
-	 * during growfs operations, the perag is not fully initialised,
-	 * so we can't use it for any useful checking. growfs ensures we can't
-	 * use it by using uncached buffers that don't have the perag attached
-	 * so we can detect and avoid this problem.
+	 * Both agf_seqno and agf_length need to be validated before anything
+	 * else block number related in the AGF or AGFL can be checked.
*/ - if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); + if (fa) + return fa; + + if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) return __this_address; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && - be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || + be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && - (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) + if (be32_to_cpu(agf->agf_bno_level) < 1 || + be32_to_cpu(agf->agf_cnt_level) < 1 || + be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) return __this_address; - return NULL; + if (xfs_has_lazysbcount(mp) && + be32_to_cpu(agf->agf_btreeblks) > agf_length) + return __this_address; + + if (xfs_has_rmapbt(mp)) { + if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) + return __this_address; + if (be32_to_cpu(agf->agf_rmap_level) < 1 || + be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) + return __this_address; + } + + if (xfs_has_reflink(mp)) { + if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) + return __this_address; + + if (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) + return __this_address; + } + + return NULL; } static void xfs_agf_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -2649,8 +3330,9 @@ static void xfs_agf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agf *agf = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agf_verify(bp); @@ -2659,17 +3341,18 @@ xfs_agf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) - XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, @@ -2678,114 +3361,151 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { /* * Read in the allocation group header (free/alloc section). 
*/ -int /* error */ +int xfs_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_BUF_ */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - int error; + struct xfs_mount *mp = pag_mount(pag); + int error; - trace_xfs_read_agf(mp, agno); + trace_xfs_read_agf(pag); - ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); if (error) return error; - if (!*bpp) - return 0; - ASSERT(!(*bpp)->b_error); - xfs_buf_set_ref(*bpp, XFS_AGF_REF); + xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); return 0; } /* - * Read in the allocation group header (free/alloc section). + * Read in the allocation group header (free/alloc section) and initialise the + * perag structure if necessary. If the caller provides @agfbpp, then return the + * locked buffer to the caller, otherwise free it. */ -int /* error */ +int xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ -{ - struct xfs_agf *agf; /* ag freelist header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) +{ + struct xfs_mount *mp = pag_mount(pag); + struct xfs_buf *agfbp; + struct xfs_agf *agf; int error; + int allocbt_blks; - trace_xfs_alloc_read_agf(mp, agno); + trace_xfs_alloc_read_agf(pag); - ASSERT(agno != NULLAGNUMBER); - error = xfs_read_agf(mp, tp, agno, + /* We don't support trylock when freeing. */ + ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != + (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); + error = xfs_read_agf(pag, tp, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, - bpp); + &agfbp); if (error) return error; - if (!*bpp) - return 0; - ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - pag = xfs_perag_get(mp, agno); - if (!pag->pagf_init) { + agf = agfbp->b_addr; + if (!xfs_perag_initialised_agf(pag)) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + if (xfs_agfl_needs_reset(mp, agf)) + set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); + else + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); + + /* + * Update the in-core allocbt counter. Filter out the rmapbt + * subset of the btreeblks counter because the rmapbt is managed + * by perag reservation. Subtract one for the rmapbt root block + * because the rmap counter includes it while the btreeblks + * counter only tracks non-root blocks. + */ + allocbt_blks = pag->pagf_btreeblks; + if (xfs_has_rmapbt(mp)) + allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; + if (allocbt_blks > 0) + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); + + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } + #ifdef DEBUG - else if (!XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); - ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); - ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); - ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); - ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + /* + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. 
+	 */
+	if (!xfs_is_shutdown(pag_mount(pag))) {
+		bool	ok = true;
+
+		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+		ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+		ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+		ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+		ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+		ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+		if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+			xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+			xfs_trans_brelse(tp, agfbp);
+			xfs_force_shutdown(pag_mount(pag),
+					SHUTDOWN_CORRUPT_ONDISK);
+			return -EFSCORRUPTED;
+		}
 	}
-#endif
-	xfs_perag_put(pag);
+#endif /* DEBUG */
+
+	if (agfbpp)
+		*agfbpp = agfbp;
+	else
+		xfs_trans_brelse(tp, agfbp);
 	return 0;
 }
 
 /*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
+ * Pre-process allocation arguments to set initial state that we don't require
+ * callers to set up correctly, as well as bounds check the allocation args
+ * that are set up.
 */
-int				/* error */
-xfs_alloc_vextent(
-	struct xfs_alloc_arg	*args)	/* allocation argument structure */
+static int
+xfs_alloc_vextent_check_args(
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		target,
+	xfs_agnumber_t		*minimum_agno)
 {
-	xfs_agblock_t	agsize;	/* allocation group size */
-	int		error;
-	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	struct xfs_mount *mp;	/* mount structure pointer */
-	xfs_agnumber_t	sagno;	/* starting allocation group number */
-	xfs_alloctype_t	type;	/* input allocation type */
-	int		bump_rotor = 0;
-	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-
-	mp = args->mp;
-	type = args->otype = args->type;
-	args->agbno = NULLAGBLOCK;
+	struct xfs_mount	*mp = args->mp;
+	xfs_agblock_t		agsize;
+
+	args->fsbno = NULLFSBLOCK;
+
+	*minimum_agno = 0;
+	if (args->tp->t_highest_agno != NULLAGNUMBER)
+		*minimum_agno = args->tp->t_highest_agno;
+
 	/*
 	 * Just fix this up, for the case where the last a.g. is shorter
 	 * (or there's only one a.g.) and the caller couldn't easily figure
@@ -2796,182 +3516,459 @@ xfs_alloc_vextent(
 		args->maxlen = agsize;
 	if (args->alignment == 0)
 		args->alignment = 1;
-	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->alignment > 0);
+	ASSERT(args->resv != XFS_AG_RESV_AGFL);
+
+	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
 	ASSERT(args->minlen <= args->maxlen);
 	ASSERT(args->minlen <= agsize);
 	ASSERT(args->mod < args->prod);
-	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
-	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+
+	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
+	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
 	    args->minlen > args->maxlen || args->minlen > agsize ||
 	    args->mod >= args->prod) {
-		args->fsbno = NULLFSBLOCK;
 		trace_xfs_alloc_vextent_badargs(args);
+		return -ENOSPC;
+	}
+
+	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
+		trace_xfs_alloc_vextent_skip_deadlock(args);
+		return -ENOSPC;
+	}
+	return 0;
+}
+
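The minimum_agno logic above encodes an AGF lock-ordering rule. The hypothetical helper below (illustration only, not in the patch) restates the rule that xfs_alloc_vextent_check_args() enforces through t_highest_agno: once a transaction holds an AGF, later allocations may only look at that AG or higher ones, keeping AGF lock order ascending and deadlock free.

/* Illustrative sketch, not part of the patch. */
static inline bool
example_agno_ordering_ok(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno)
{
	/* first allocation in the transaction can pick any AG */
	if (tp->t_highest_agno == NULLAGNUMBER)
		return true;
	/* later allocations must not lock an AGF below one we hold */
	return agno >= tp->t_highest_agno;
}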
+ *
+ * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
+ * modified to hold their own perag references.
+ */
+static int
+xfs_alloc_vextent_prepare_ag(
+	struct xfs_alloc_arg	*args,
+	uint32_t		alloc_flags)
+{
+	bool			need_pag = !args->pag;
+	int			error;
+
+	if (need_pag)
+		args->pag = xfs_perag_get(args->mp, args->agno);
+
+	args->agbp = NULL;
+	error = xfs_alloc_fix_freelist(args, alloc_flags);
+	if (error) {
+		trace_xfs_alloc_vextent_nofix(args);
+		if (need_pag)
+			xfs_perag_put(args->pag);
+		args->agbno = NULLAGBLOCK;
+		return error;
+	}
+	if (!args->agbp) {
+		/* cannot allocate in this AG at all */
+		trace_xfs_alloc_vextent_noagbp(args);
+		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
+	args->wasfromfl = 0;
+	return 0;
+}
 
-	switch (type) {
-	case XFS_ALLOCTYPE_THIS_AG:
-	case XFS_ALLOCTYPE_NEAR_BNO:
-	case XFS_ALLOCTYPE_THIS_BNO:
-		/*
-		 * These three force us into a single a.g.
-		 */
-		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		args->pag = xfs_perag_get(mp, args->agno);
-		error = xfs_alloc_fix_freelist(args, 0);
-		if (error) {
-			trace_xfs_alloc_vextent_nofix(args);
-			goto error0;
-		}
-		if (!args->agbp) {
-			trace_xfs_alloc_vextent_noagbp(args);
+/*
+ * Post-process allocation results to account for the allocation if it
+ * succeeded and set the allocated block number correctly for the caller.
+ *
+ * XXX: we should really be returning ENOSPC for ENOSPC, not
+ * hiding it behind a "successful" NULLFSBLOCK allocation.
+ */
+static int
+xfs_alloc_vextent_finish(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno,
+	int			alloc_error,
+	bool			drop_perag)
+{
+	struct xfs_mount	*mp = args->mp;
+	int			error = 0;
+
+	/*
+	 * We can end up here with a locked AGF. If we failed, the caller is
+	 * likely going to try to allocate again with different parameters, and
+	 * that can widen the AGs that are searched for free space. If we have
+	 * to do BMBT block allocation, we have to do a new allocation.
+	 *
+	 * Hence leaving this function with the AGF locked opens up potential
+	 * ABBA AGF deadlocks because a future allocation attempt in this
+	 * transaction may attempt to lock a lower number AGF.
+	 *
+	 * We can't release the AGF until the transaction is committed, so at
+	 * this point we must update the "first allocation" tracker to point at
+	 * this AG if the tracker is empty or points to a lower AG. This allows
+	 * the next allocation attempt to be modified appropriately to avoid
+	 * deadlocks.
+	 */
+	if (args->agbp &&
+	    (args->tp->t_highest_agno == NULLAGNUMBER ||
+	     args->agno > minimum_agno))
+		args->tp->t_highest_agno = args->agno;
+
+	/*
+	 * If the allocation failed with an error or we had an ENOSPC result,
+	 * preserve the returned error whilst also marking the allocation result
+	 * as "no extent allocated". This ensures that callers that fail to
+	 * capture the error will still treat it as a failed allocation.
+	 */
+	if (alloc_error || args->agbno == NULLAGBLOCK) {
+		args->fsbno = NULLFSBLOCK;
+		error = alloc_error;
+		goto out_drop_perag;
+	}
+
+	args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);
+
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(args->agbno % args->alignment == 0);
+	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
+
+	/* if not file data, insert new block into the reverse map btree */
+	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
+				args->agbno, args->len, &args->oinfo);
+		if (error)
+			goto out_drop_perag;
+	}
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->agbp,
+				-((long)(args->len)));
+		if (error)
+			goto out_drop_perag;
+
+		ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
+				args->agbno, args->len));
+	}
+
+	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+	XFS_STATS_INC(mp, xs_allocx);
+	XFS_STATS_ADD(mp, xs_allocb, args->len);
+
+	trace_xfs_alloc_vextent_finish(args);
+
+out_drop_perag:
+	if (drop_perag && args->pag) {
+		xfs_perag_rele(args->pag);
+		args->pag = NULL;
+	}
+	return error;
+}
+
+/*
+ * Allocate within a single AG only. This uses a best-fit length algorithm so
+ * if you need an exact-sized allocation without locality constraints, this is
+ * the fastest way to do it.
+ *
+ * Caller is expected to hold a perag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_this_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		agno)
+{
+	xfs_agnumber_t		minimum_agno;
+	uint32_t		alloc_flags = 0;
+	int			error;
+
+	ASSERT(args->pag != NULL);
+	ASSERT(pag_agno(args->pag) == agno);
+
+	args->agno = agno;
+	args->agbno = 0;
+
+	trace_xfs_alloc_vextent_this_ag(args);
+
+	error = xfs_alloc_vextent_check_args(args,
+			xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
+
+	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+	if (!error && args->agbp)
+		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
+
+	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Iterate all AGs trying to allocate an extent starting from @start_ag.
+ *
+ * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
+ * allocation attempts in @start_agno have locality information. If we fail to
+ * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
+ * we attempt to allocate in, as there is no locality optimisation possible
+ * for those allocations.
+ *
+ * On return, args->pag may be left referenced if we finish before the "all
+ * failed" return point. The allocation finish still needs the perag, and
+ * so the caller will release it once they've finished the allocation.
+ *
+ * When we wrap the AG iteration at the end of the filesystem, we have to be
+ * careful not to wrap into AGs below ones we already have locked in the
+ * transaction if we are doing a blocking iteration. This will result in
+ * out-of-order locking of AGFs and hence can cause deadlocks.
+ */
+static int
+xfs_alloc_vextent_iterate_ags(
+	struct xfs_alloc_arg	*args,
+	xfs_agnumber_t		minimum_agno,
+	xfs_agnumber_t		start_agno,
+	xfs_agblock_t		target_agbno,
+	uint32_t		alloc_flags)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		restart_agno = minimum_agno;
+	xfs_agnumber_t		agno;
+	int			error = 0;
+
+	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
+		restart_agno = 0;
+restart:
+	for_each_perag_wrap_range(mp, start_agno, restart_agno,
+			mp->m_sb.sb_agcount, agno, args->pag) {
+		args->agno = agno;
+		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+		if (error)
 			break;
+		if (!args->agbp) {
+			trace_xfs_alloc_vextent_loopfailed(args);
+			continue;
 		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		if ((error = xfs_alloc_ag_vextent(args)))
-			goto error0;
-		break;
-	case XFS_ALLOCTYPE_START_BNO:
-		/*
-		 * Try near allocation first, then anywhere-in-ag after
-		 * the first a.g. fails.
-		 */
-		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
-		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
-			args->fsbno = XFS_AGB_TO_FSB(mp,
-					((mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount), 0);
-			bump_rotor = 1;
-		}
-		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
-		args->type = XFS_ALLOCTYPE_NEAR_BNO;
-		/* FALLTHROUGH */
-	case XFS_ALLOCTYPE_FIRST_AG:
+
 		/*
-		 * Rotate through the allocation groups looking for a winner.
+		 * Allocation is supposed to succeed now, so break out of the
+		 * loop regardless of whether we succeed or not.
 		 */
-		if (type == XFS_ALLOCTYPE_FIRST_AG) {
-			/*
-			 * Start with allocation group given by bno.
-			 */
-			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-			args->type = XFS_ALLOCTYPE_THIS_AG;
-			sagno = 0;
-			flags = 0;
+		if (args->agno == start_agno && target_agbno) {
+			args->agbno = target_agbno;
+			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
 		} else {
-			/*
-			 * Start with the given allocation group.
-			 */
-			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
+			args->agbno = 0;
+			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
 		}
-		/*
-		 * Loop over allocation groups twice; first time with
-		 * trylock set, second time without.
-		 */
-		for (;;) {
-			args->pag = xfs_perag_get(mp, args->agno);
-			error = xfs_alloc_fix_freelist(args, flags);
-			if (error) {
-				trace_xfs_alloc_vextent_nofix(args);
-				goto error0;
-			}
-			/*
-			 * If we get a buffer back then the allocation will fly.
-			 */
-			if (args->agbp) {
-				if ((error = xfs_alloc_ag_vextent(args)))
-					goto error0;
-				break;
-			}
+		break;
+	}
+	if (error) {
+		xfs_perag_rele(args->pag);
+		args->pag = NULL;
+		return error;
+	}
+	if (args->agbp)
+		return 0;
 
-			trace_xfs_alloc_vextent_loopfailed(args);
+	/*
+	 * We didn't find an AG we can allocate from. If we were given
+	 * constraining flags by the caller, drop them and retry the allocation
+	 * without any constraints being set.
+	 */
+	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
+		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
+		restart_agno = minimum_agno;
+		goto restart;
+	}
 
-			/*
-			 * Didn't work, figure out the next iteration.
-			 */
-			if (args->agno == sagno &&
-			    type == XFS_ALLOCTYPE_START_BNO)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
-			/*
-			 * For the first allocation, we can try any AG to get
-			 * space. However, if we already have allocated a
-			 * block, we don't want to try AGs whose number is below
-			 * sagno. Otherwise, we may end up with out-of-order
-			 * locking of AGF, which might cause deadlock.
-			 */
-			if (++(args->agno) == mp->m_sb.sb_agcount) {
-				if (args->tp->t_firstblock != NULLFSBLOCK)
-					args->agno = sagno;
-				else
-					args->agno = 0;
-			}
-			/*
-			 * Reached the starting a.g., must either be done
-			 * or switch to non-trylock mode.
-			 */
-			if (args->agno == sagno) {
-				if (flags == 0) {
-					args->agbno = NULLAGBLOCK;
-					trace_xfs_alloc_vextent_allfailed(args);
-					break;
-				}
-
-				flags = 0;
-				if (type == XFS_ALLOCTYPE_START_BNO) {
-					args->agbno = XFS_FSB_TO_AGBNO(mp,
-						args->fsbno);
-					args->type = XFS_ALLOCTYPE_NEAR_BNO;
-				}
-			}
-			xfs_perag_put(args->pag);
-		}
-		if (bump_rotor) {
-			if (args->agno == sagno)
-				mp->m_agfrotor = (mp->m_agfrotor + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-			else
-				mp->m_agfrotor = (args->agno * rotorstep + 1) %
-					(mp->m_sb.sb_agcount * rotorstep);
-		}
-		break;
-	default:
-		ASSERT(0);
-		/* NOTREACHED */
+	ASSERT(args->pag == NULL);
+	trace_xfs_alloc_vextent_allfailed(args);
+	return 0;
+}
+
+/*
+ * Iterate the AGs from the start AG to the end of the filesystem, trying to
+ * allocate blocks. It starts with a near allocation attempt in the initial
+ * AG, then falls back to anywhere-in-ag after the first AG fails. It will
+ * wrap back to zero if allowed by previous allocations in this transaction,
+ * otherwise it will wrap back to the start AG and run a second blocking pass
+ * to the end of the filesystem.
+ */
+int
+xfs_alloc_vextent_start_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		target)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		minimum_agno;
+	xfs_agnumber_t		start_agno;
+	xfs_agnumber_t		rotorstep = xfs_rotorstep;
+	bool			bump_rotor = false;
+	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
+	int			error;
+
+	ASSERT(args->pag == NULL);
+
+	args->agno = NULLAGNUMBER;
+	args->agbno = NULLAGBLOCK;
+
+	trace_xfs_alloc_vextent_start_ag(args);
+
+	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
 	}
 
-	if (args->agbno == NULLAGBLOCK)
-		args->fsbno = NULLFSBLOCK;
-	else {
-		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
-		ASSERT(args->len >= args->minlen);
-		ASSERT(args->len <= args->maxlen);
-		ASSERT(args->agbno % args->alignment == 0);
-		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
-			args->len);
-#endif
-		/* Zero the extent if we were asked to do so */
-		if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
-			error = xfs_zero_extent(args->ip, args->fsbno, args->len);
-			if (error)
-				goto error0;
-		}
+	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+	    xfs_is_inode32(mp)) {
+		target = XFS_AGB_TO_FSB(mp,
+				((mp->m_agfrotor / rotorstep) %
+				mp->m_sb.sb_agcount), 0);
+		bump_rotor = true;
+	}
 
+	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
+
+	if (bump_rotor) {
+		if (args->agno == start_agno)
+			mp->m_agfrotor = (mp->m_agfrotor + 1) %
+				(mp->m_sb.sb_agcount * rotorstep);
+		else
+			mp->m_agfrotor = (args->agno * rotorstep + 1) %
+				(mp->m_sb.sb_agcount * rotorstep);
 	}
-	xfs_perag_put(args->pag);
-	return 0;
-error0:
-	xfs_perag_put(args->pag);
-	return error;
+
+	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
+
+/*
+ * Iterate from the agno indicated via @target through to the end of the
+ * filesystem attempting blocking allocation. This does not wrap or try a
+ * second pass, so it will not consider AGs below the one indicated by the
+ * target.
+ */
+int
+xfs_alloc_vextent_first_ag(
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		target)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		minimum_agno;
+	xfs_agnumber_t		start_agno;
+	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
+	int			error;
+
+	ASSERT(args->pag == NULL);
+
+	args->agno = NULLAGNUMBER;
+	args->agbno = NULLAGBLOCK;
+
+	trace_xfs_alloc_vextent_first_ag(args);
+
+	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
+
+	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
+	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
+
+/*
+ * Allocate at the exact block target or fail. Caller is expected to hold a
+ * perag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_exact_bno(
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		target)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		minimum_agno;
+	int			error;
+
+	ASSERT(args->pag != NULL);
+	ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
+
+	args->agno = XFS_FSB_TO_AGNO(mp, target);
+	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+
+	trace_xfs_alloc_vextent_exact_bno(args);
+
+	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
+
+	error = xfs_alloc_vextent_prepare_ag(args, 0);
+	if (!error && args->agbp)
+		error = xfs_alloc_ag_vextent_exact(args);
+
+	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Allocate an extent as close to the target as possible. If there are no
+ * viable candidates in the AG, then fail the allocation.
+ *
+ * Caller may or may not have a per-ag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_near_bno(
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		target)
+{
+	struct xfs_mount	*mp = args->mp;
+	xfs_agnumber_t		minimum_agno;
+	bool			needs_perag = args->pag == NULL;
+	uint32_t		alloc_flags = 0;
+	int			error;
+
+	if (!needs_perag)
+		ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
+
+	args->agno = XFS_FSB_TO_AGNO(mp, target);
+	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+
+	trace_xfs_alloc_vextent_near_bno(args);
+
+	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+	if (error) {
+		if (error == -ENOSPC)
+			return 0;
+		return error;
+	}
+
+	if (needs_perag)
+		args->pag = xfs_perag_grab(mp, args->agno);
+
+	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+	if (!error && args->agbp)
+		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
+
+	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}
 
 /* Ensure that the freelist is at full capacity.
  */
 int
 xfs_free_extent_fix_freelist(
 	struct xfs_trans	*tp,
-	xfs_agnumber_t		agno,
+	struct xfs_perag	*pag,
 	struct xfs_buf		**agbp)
 {
 	struct xfs_alloc_arg	args;
@@ -2980,7 +3977,8 @@ xfs_free_extent_fix_freelist(
 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
-	args.agno = agno;
+	args.agno = pag_agno(pag);
+	args.pag = pag;
 
 	/*
 	 * validate that the block number is legal - this enables us to detect
@@ -2989,17 +3987,12 @@ xfs_free_extent_fix_freelist(
 	if (args.agno >= args.mp->m_sb.sb_agcount)
 		return -EFSCORRUPTED;
 
-	args.pag = xfs_perag_get(args.mp, args.agno);
-	ASSERT(args.pag);
-
 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
 	if (error)
-		goto out;
+		return error;
 
 	*agbp = args.agbp;
-out:
-	xfs_perag_put(args.pag);
-	return error;
+	return 0;
 }
 
 /*
@@ -3010,7 +4003,8 @@ out:
 int
 __xfs_free_extent(
 	struct xfs_trans	*tp,
-	xfs_fsblock_t		bno,
+	struct xfs_perag	*pag,
+	xfs_agblock_t		agbno,
 	xfs_extlen_t		len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type	type,
@@ -3018,39 +4012,48 @@ __xfs_free_extent(
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_buf		*agbp;
-	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, bno);
-	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	struct xfs_agf		*agf;
 	int			error;
 	unsigned int		busy_flags = 0;
 
 	ASSERT(len != 0);
 	ASSERT(type != XFS_AG_RESV_AGFL);
 
-	if (XFS_TEST_ERROR(false, mp,
-			XFS_ERRTAG_FREE_EXTENT))
+	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
 		return -EIO;
 
-	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
-	if (error)
+	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+	if (error) {
+		if (xfs_metadata_is_sick(error))
+			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
 		return error;
+	}
+
+	agf = agbp->b_addr;
 
-	XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
+	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
+		error = -EFSCORRUPTED;
+		goto err_release;
+	}
 
 	/* validate the extent size is legal now we have the agf locked */
-	XFS_WANT_CORRUPTED_GOTO(mp,
-		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
-		err);
+	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
+		error = -EFSCORRUPTED;
+		goto err_release;
+	}
 
-	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+	error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
 	if (error)
-		goto err;
+		goto err_release;
 
 	if (skip_discard)
 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
-	xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
+	xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
 	return 0;
 
-err:
+err_release:
 	xfs_trans_brelse(tp, agbp);
 	return error;
 }
@@ -3064,14 +4067,18 @@ struct xfs_alloc_query_range_info {
 STATIC int
 xfs_alloc_query_range_helper(
 	struct xfs_btree_cur		*cur,
-	union xfs_btree_rec		*rec,
+	const union xfs_btree_rec	*rec,
 	void				*priv)
 {
 	struct xfs_alloc_query_range_info	*query = priv;
 	struct xfs_alloc_rec_incore		irec;
+	xfs_failaddr_t				fa;
+
+	xfs_alloc_btrec_to_irec(rec, &irec);
+	fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
+	if (fa)
+		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
 
-	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
-	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
 	return query->fn(cur, &irec, query->priv);
 }
 
@@ -3079,20 +4086,16 @@ xfs_alloc_query_range_helper(
 int
 xfs_alloc_query_range(
 	struct xfs_btree_cur		*cur,
-	struct xfs_alloc_rec_incore	*low_rec,
-	struct xfs_alloc_rec_incore	*high_rec,
+	const struct xfs_alloc_rec_incore *low_rec,
+	const struct xfs_alloc_rec_incore *high_rec,
 	xfs_alloc_query_range_fn	fn,
 	void				*priv)
 {
-	union xfs_btree_irec		low_brec;
-	union xfs_btree_irec		high_brec;
-	struct xfs_alloc_query_range_info	query;
+	union xfs_btree_irec		low_brec = { .a = *low_rec };
+	union xfs_btree_irec		high_brec = { .a = *high_rec };
+	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
-	low_brec.a = *low_rec;
-	high_brec.a = *high_rec;
-	query.priv = priv;
-	query.fn = fn;
+	ASSERT(xfs_btree_is_bno(cur->bc_ops));
 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
 			xfs_alloc_query_range_helper, &query);
 }
@@ -3106,19 +4109,22 @@ xfs_alloc_query_all(
 {
 	struct xfs_alloc_query_range_info	query;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	ASSERT(xfs_btree_is_bno(cur->bc_ops));
 	query.priv = priv;
 	query.fn = fn;
 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
 }
 
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the free space and tell us if the area has no
+ * records, is fully mapped by records, or is partially filled.
+ */
 int
-xfs_alloc_has_record(
+xfs_alloc_has_records(
 	struct xfs_btree_cur	*cur,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	bool			*exists)
+	enum xbtree_recpacking	*outcome)
 {
 	union xfs_btree_irec	low;
 	union xfs_btree_irec	high;
@@ -3128,12 +4134,12 @@ xfs_alloc_has_record(
 	memset(&high, 0xFF, sizeof(high));
 	high.a.ar_startblock = bno + len - 1;
 
-	return xfs_btree_has_record(cur, &low, &high, exists);
+	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
 }
 
 /*
  * Walk all the blocks in the AGFL. The @walk_fn can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ * error code or XFS_ITER_*.
  */
 int
 xfs_agfl_walk(
@@ -3147,7 +4153,7 @@ xfs_agfl_walk(
 	unsigned int		i;
 	int			error;
 
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	i = be32_to_cpu(agf->agf_flfirst);
 
 	/* Nothing to walk in an empty AGFL. */
@@ -3167,3 +4173,20 @@ xfs_agfl_walk(
 
 	return 0;
 }
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+			sizeof(struct xfs_extent_free_item),
+			0, 0, NULL);
+
+	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+	kmem_cache_destroy(xfs_extfree_item_cache);
+	xfs_extfree_item_cache = NULL;
+}
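
Editor's note: to make the shape of the reworked allocation API concrete, here is a minimal caller sketch. It is not part of the patch; the helper name example_alloc_one_extent and its error handling are hypothetical, and XFS_RMAP_OINFO_ANY_OWNER and XFS_AG_RESV_NONE are assumed from the wider XFS codebase. The struct xfs_alloc_arg fields and the xfs_alloc_vextent_start_ag() entry point are as shown in the diff above.

/*
 * Hypothetical caller sketch (not part of this patch): allocate one extent
 * of @len blocks (@len > 0) with locality to @target using the new API.
 */
static int
example_alloc_one_extent(
	struct xfs_trans	*tp,
	xfs_fsblock_t		target,
	xfs_extlen_t		len,
	xfs_fsblock_t		*fsbno)
{
	struct xfs_alloc_arg	args = {
		.tp		= tp,
		.mp		= tp->t_mountp,
		.minlen		= len,
		.maxlen		= len,
		.prod		= 1,	/* no allocation size multiple */
		.alignment	= 1,	/* no extent alignment */
		.oinfo		= XFS_RMAP_OINFO_ANY_OWNER,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	/*
	 * Near allocation at @target first, falling back to anywhere-in-AG
	 * across the rest of the filesystem, honouring t_highest_agno.
	 */
	error = xfs_alloc_vextent_start_ag(&args, target);
	if (error)
		return error;

	/* As the XXX comment above notes, ENOSPC is reported as NULLFSBLOCK. */
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;

	*fsbno = args.fsbno;
	return 0;
}

Note how the target is now passed explicitly to the entry point rather than being smuggled in through args->fsbno and an alloctype, which is the core of this refactor.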
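The freeing side changes the same way: __xfs_free_extent() now takes an active perag reference and an AG-relative block instead of a raw xfs_fsblock_t. A hedged sketch of a caller, again hypothetical; the conversion macros and xfs_perag_get()/xfs_perag_put() pairing mirror what this diff removes from xfs_free_extent_fix_freelist().

/*
 * Hypothetical caller sketch (not part of this patch): free an extent via
 * the reworked per-AG interface, resolving the perag reference ourselves.
 */
static int
example_free_one_extent(
	struct xfs_trans	*tp,
	xfs_fsblock_t		fsbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_perag	*pag;
	int			error;

	/* Callers now supply the perag and the AG-relative block number. */
	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno));
	error = __xfs_free_extent(tp, pag, XFS_FSB_TO_AGBNO(mp, fsbno),
			len, &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_NONE,
			false /* skip_discard */);
	xfs_perag_put(pag);
	return error;
}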
