summaryrefslogtreecommitdiff
path: root/fs/xfs/libxfs/xfs_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/libxfs/xfs_alloc.c')
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1870
1 files changed, 1247 insertions, 623 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 989cf341779b..ad381c73abc4 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -26,20 +26,16 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
+#include "xfs_health.h"
+#include "xfs_extfree_item.h"
struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
-#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-
/*
* Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
* the beginning of the block for a proper header with the location information
@@ -82,7 +78,7 @@ xfs_prealloc_blocks(
}
/*
- * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
* guarantee that we can refill the AGFL prior to allocating space in a nearly
* full AG. Although the space described by the free space btrees, the
* blocks used by the freesp btrees themselves, and the blocks owned by the
@@ -92,7 +88,7 @@ xfs_prealloc_blocks(
* until the fs goes down, we subtract this many AG blocks from the incore
* fdblocks to ensure user allocation does not overcommit the space the
* filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
- * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ * withhold space from xfs_dec_fdblocks, so we do not account for that here.
*/
#define XFS_ALLOCBT_AGFL_RESERVE 4
@@ -154,23 +150,38 @@ xfs_alloc_ag_max_usable(
return mp->m_sb.sb_agblocks - blocks;
}
+
+/*
+ * Common helper for the free space btree lookups.
+ *
+ * Load [bno, len] into the cursor's search record, run xfs_btree_lookup() in
+ * the requested direction, and keep XFS_BTREE_ALLOCBT_ACTIVE in cur->bc_flags
+ * in sync with the outcome: the flag is set only when the lookup succeeded
+ * and reported a match (*stat == 1), and is cleared otherwise.
+ *
+ * Returns the error code from xfs_btree_lookup() unchanged.
+ */
+static int
+xfs_alloc_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* lookup direction */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	int			error;
+
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	error = xfs_btree_lookup(cur, dir, stat);
+
+	/*
+	 * Only consult *stat if the lookup itself succeeded.  A failed lookup
+	 * may return before writing *stat (NOTE(review): xfs_btree_lookup() is
+	 * not visible here - confirm), and callers commonly pass an
+	 * uninitialised local, so never mark the cursor active on error.
+	 */
+	if (!error && *stat == 1)
+		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
+	else
+		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+	return error;
+}
+
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
-STATIC int /* error */
+static inline int /* error */
xfs_alloc_lookup_eq(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}
/*
@@ -184,13 +195,7 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}
/*
@@ -204,19 +209,14 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}
static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_ag.abt.active;
+ return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}
/*
@@ -237,6 +237,50 @@ xfs_alloc_update(
return xfs_btree_update(cur, &rec);
}
+/*
+ * Decode an ondisk free space btree record into the caller-supplied incore
+ * record, converting both fields with be32_to_cpu().
+ */
+void
+xfs_alloc_btrec_to_irec(
+	const union xfs_btree_rec	*rec,
+	struct xfs_alloc_rec_incore	*irec)
+{
+	xfs_agblock_t			startblock;
+	xfs_extlen_t			blockcount;
+
+	startblock = be32_to_cpu(rec->alloc.ar_startblock);
+	blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+
+	irec->ar_startblock = startblock;
+	irec->ar_blockcount = blockcount;
+}
+
+/*
+ * Basic sanity checks for an incore free space record.  Returns NULL when the
+ * record passes, otherwise __this_address from the check that rejected it.
+ */
+xfs_failaddr_t
+xfs_alloc_check_irec(
+	struct xfs_perag			*pag,
+	const struct xfs_alloc_rec_incore	*irec)
+{
+	/* a zero-length free extent is never valid */
+	if (!irec->ar_blockcount)
+		return __this_address;
+
+	/* the extent range must be valid, including overflow */
+	if (xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
+		return NULL;
+
+	return __this_address;
+}
+
+/*
+ * Report a bad free space record: emit two warnings (which btree and AG, the
+ * failing check address, and the record contents), mark the btree sick, and
+ * return -EFSCORRUPTED so the caller can pass it straight back.
+ */
+static inline int
+xfs_alloc_complain_bad_rec(
+	struct xfs_btree_cur			*cur,
+	xfs_failaddr_t				fa,
+	const struct xfs_alloc_rec_incore	*irec)
+{
+	xfs_warn(cur->bc_mp,
+ "%sbt record corruption in AG %d detected at %pS!",
+		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
+	xfs_warn(cur->bc_mp,
+ "start block 0x%x block count 0x%x",
+		irec->ar_startblock, irec->ar_blockcount);
+
+	xfs_btree_mark_sick(cur);
+	return -EFSCORRUPTED;
+}
+
/*
* Get the data from the pointed-to record.
*/
@@ -247,35 +291,23 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat) /* output: success/failure */
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_alloc_rec_incore irec;
union xfs_btree_rec *rec;
+ xfs_failaddr_t fa;
int error;
error = xfs_btree_get_rec(cur, &rec, stat);
if (error || !(*stat))
return error;
- *bno = be32_to_cpu(rec->alloc.ar_startblock);
- *len = be32_to_cpu(rec->alloc.ar_blockcount);
-
- if (*len == 0)
- goto out_bad_rec;
-
- /* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(pag, *bno, *len))
- goto out_bad_rec;
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
+ if (fa)
+ return xfs_alloc_complain_bad_rec(cur, fa, &irec);
+ *bno = irec.ar_startblock;
+ *len = irec.ar_blockcount;
return 0;
-
-out_bad_rec:
- xfs_warn(mp,
- "%s Freespace BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
- pag->pag_agno);
- xfs_warn(mp,
- "start block 0x%x block count 0x%x", *bno, *len);
- return -EFSCORRUPTED;
}
/*
@@ -297,7 +329,8 @@ xfs_alloc_compute_aligned(
bool busy;
/* Trim busy sections out of found extent */
- busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
+ busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen,
+ args->maxlen, &bno, &len, busy_gen);
/*
* If we have a largish extent that happens to start before min_agbno,
@@ -375,8 +408,8 @@ xfs_alloc_compute_diff(
if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
if (newlen1 < newlen2 ||
(newlen1 == newlen2 &&
- XFS_ABSDIFF(newbno1, wantbno) >
- XFS_ABSDIFF(newbno2, wantbno)))
+ abs_diff(newbno1, wantbno) >
+ abs_diff(newbno2, wantbno)))
newbno1 = newbno2;
} else if (newbno2 != NULLAGBLOCK)
newbno1 = newbno2;
@@ -392,7 +425,7 @@ xfs_alloc_compute_diff(
} else
newbno1 = freeend - wantlen;
*newbnop = newbno1;
- return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+ return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno);
}
/*
@@ -433,6 +466,97 @@ xfs_alloc_fix_len(
}
/*
+ * Report whether the by-count cursor currently sits in the right-most block
+ * of records, i.e. the level-0 block with no right sibling.  That block holds
+ * the largest contiguous free extent in the AG, so a caller about to modify a
+ * record there must follow up with xfs_alloc_fixup_longest() once it is done,
+ * keeping agf->agf_longest in step with the longest free extent tracked by
+ * the by-count btree.
+ */
+static bool
+xfs_alloc_cursor_at_lastrec(
+	struct xfs_btree_cur	*cnt_cur)
+{
+	union xfs_btree_ptr	rightsib;
+	struct xfs_buf		*block_bp;
+	struct xfs_btree_block	*cur_block;
+
+	cur_block = xfs_btree_get_block(cnt_cur, 0, &block_bp);
+
+	/* a null right sibling pointer means nothing lies to the right */
+	xfs_btree_get_sibling(cnt_cur, cur_block, &rightsib, XFS_BB_RIGHTSIB);
+	return xfs_btree_ptr_is_null(cnt_cur, &rightsib);
+}
+
+/*
+ * Return, via @longest, the length stored in the rightmost cntbt record, or
+ * zero if the tree holds no records.  The search key is filled with all-ones
+ * so that both the block number and the length are at their maximum values
+ * for the less-than-or-equal lookup.
+ */
+static int
+xfs_cntbt_longest(
+	struct xfs_btree_cur	*cnt_cur,
+	xfs_extlen_t		*longest)
+{
+	struct xfs_alloc_rec_incore	last_irec;
+	union xfs_btree_rec		*last_rec;
+	int				stat = 0;
+	int				error;
+
+	memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec));
+	error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat);
+	if (error)
+		return error;
+
+	if (stat) {
+		error = xfs_btree_get_rec(cnt_cur, &last_rec, &stat);
+		if (error)
+			return error;
+
+		/* the lookup found a record, so fetching it must succeed */
+		if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) {
+			xfs_btree_mark_sick(cnt_cur);
+			return -EFSCORRUPTED;
+		}
+
+		xfs_alloc_btrec_to_irec(last_rec, &last_irec);
+		*longest = last_irec.ar_blockcount;
+		return 0;
+	}
+
+	/* totally empty tree */
+	*longest = 0;
+	return 0;
+}
+
+/*
+ * Refresh the longest-free-extent value for the AG using the by-count cursor
+ * handed to us: look up the current longest extent, store it in the perag and
+ * in the AGF, and log the AGF change.  Call this at the end of any allocation
+ * or freeing operation that touches the longest extent in the btree.
+ *
+ * Whether an update is needed can be determined by calling
+ * xfs_alloc_cursor_at_lastrec() once the cursor is positioned for the record
+ * modification, but before that modification starts.
+ */
+static int
+xfs_alloc_fixup_longest(
+	struct xfs_btree_cur	*cnt_cur)
+{
+	struct xfs_buf		*agbp = cnt_cur->bc_ag.agbp;
+	struct xfs_agf		*agf = agbp->b_addr;
+	struct xfs_perag	*pag = to_perag(cnt_cur->bc_group);
+	xfs_extlen_t		new_longest = 0;
+	int			error;
+
+	/* Find the last cntbt record; its length is the new longest. */
+	error = xfs_cntbt_longest(cnt_cur, &new_longest);
+	if (error)
+		return error;
+
+	/* Update the incore copy first, then mirror it into the AGF. */
+	pag->pagf_longest = new_longest;
+	agf->agf_longest = cpu_to_be32(pag->pagf_longest);
+	xfs_alloc_log_agf(cnt_cur->bc_tp, agbp, XFS_AGF_LONGEST);
+
+	return 0;
+}
+
+/*
* Update the two btrees, logically removing from freespace the extent
* starting at rbno, rlen blocks. The extent is contained within the
* actual (current) free extent fbno for flen blocks.
@@ -456,6 +580,7 @@ xfs_alloc_fixup_trees(
xfs_extlen_t nflen1=0; /* first new free length */
xfs_extlen_t nflen2=0; /* second new free length */
struct xfs_mount *mp;
+ bool fixup_longest = false;
mp = cnt_cur->bc_mp;
@@ -469,14 +594,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Look up the record in the by-block tree if necessary.
@@ -488,14 +617,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#ifdef DEBUG
@@ -508,8 +641,10 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
- cntblock->bb_numrecs))
+ cntblock->bb_numrecs)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#endif
@@ -534,35 +669,49 @@ xfs_alloc_fixup_trees(
nfbno2 = rbno + rlen;
nflen2 = (fbno + flen) - nfbno2;
}
+
+ if (xfs_alloc_cursor_at_lastrec(cnt_cur))
+ fixup_longest = true;
+
/*
* Delete the entry from the by-size btree.
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Fix up the by-block btree entry(s).
@@ -573,8 +722,10 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -588,16 +739,43 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
+
+ if (fixup_longest)
+ return xfs_alloc_fixup_longest(cnt_cur);
+
return 0;
}
+/*
+ * We do not verify the AGFL contents against AGF-based index counters here,
+ * even though we may have access to the perag that contains shadow copies. We
+ * don't know if the AGF based counters have been checked, and if they have they
+ * still may be inconsistent because they haven't yet been reset on the first
+ * allocation after the AGF has been read in.
+ *
+ * This means we can only check that all agfl entries contain valid or null
+ * values because we can't reliably determine the active range to exclude
+ * NULLAGBNO as a valid value.
+ *
+ * However, we can't even do that for v4 format filesystems because there are
+ * old versions of mkfs out there that do not initialise the AGFL to known,
+ * verifiable values. Hence we can't tell the difference between an AGFL block
+ * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
+ *
+ * As a result, we can only fully validate AGFL block numbers when we pull them
+ * from the freelist in xfs_alloc_get_freelist().
+ */
static xfs_failaddr_t
xfs_agfl_verify(
struct xfs_buf *bp)
@@ -607,12 +785,6 @@ xfs_agfl_verify(
__be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
- /*
- * There is no verification of non-crc AGFLs because mkfs does not
- * initialise the AGFL to zero or NULL. Hence the only valid part of the
- * AGFL is what the AGF says is active. We can't get to the AGF, so we
- * can't verify just those entries are valid.
- */
if (!xfs_has_crc(mp))
return NULL;
@@ -626,7 +798,7 @@ xfs_agfl_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag)))
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
@@ -706,14 +878,15 @@ xfs_alloc_read_agfl(
struct xfs_trans *tp,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *bp;
int error;
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_AGFL_REF);
@@ -735,6 +908,7 @@ xfs_alloc_update_counters(
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
xfs_buf_mark_corrupt(agbp);
+ xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
}
@@ -772,8 +946,6 @@ xfs_alloc_cur_setup(
int error;
int i;
- ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
-
acur->cur_len = args->maxlen;
acur->rec_bno = 0;
acur->rec_len = 0;
@@ -789,8 +961,8 @@ xfs_alloc_cur_setup(
* attempt a small allocation.
*/
if (!acur->cnt)
- acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_CNT);
+ acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -799,11 +971,11 @@ xfs_alloc_cur_setup(
* Allocate the bnobt left and right search cursors.
*/
if (!acur->bnolt)
- acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
if (!acur->bnogt)
- acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
return i == 1 ? 0 : -ENOSPC;
}
@@ -845,15 +1017,17 @@ xfs_alloc_cur_check(
bool busy;
unsigned busy_gen = 0;
bool deactivate = false;
- bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+ bool isbnobt = xfs_btree_is_bno(cur->bc_ops);
*new = 0;
error = xfs_alloc_get_rec(cur, &bno, &len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Check minlen and deactivate a cntbt cursor if out of acceptable size
@@ -887,7 +1061,6 @@ xfs_alloc_cur_check(
* We have an aligned record that satisfies minlen and beats or matches
* the candidate extent size. Compare locality for near allocation mode.
*/
- ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
diff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, args->datatype,
bnoa, lena, &bnew);
@@ -920,9 +1093,8 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_ag.abt.active = false;
- trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
- *new);
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
return 0;
}
@@ -935,13 +1107,12 @@ xfs_alloc_cur_finish(
struct xfs_alloc_arg *args,
struct xfs_alloc_cur *acur)
{
- struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
int error;
ASSERT(acur->cnt && acur->bnolt);
ASSERT(acur->bno >= acur->rec_bno);
ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
- ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
+ ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));
error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
acur->rec_len, acur->bno, acur->len, 0);
@@ -1060,6 +1231,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1078,14 +1250,14 @@ xfs_alloc_ag_vextent_small(
if (fbno == NULLAGBLOCK)
goto out;
- xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
+ xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1,
(args->datatype & XFS_ALLOC_NOBUSY));
if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ xfs_agbno_to_daddr(args->pag, fbno),
args->mp->m_bsize, 0, &bp);
if (error)
goto error;
@@ -1094,6 +1266,7 @@ xfs_alloc_ag_vextent_small(
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1133,78 +1306,6 @@ error:
}
/*
- * Allocate a variable extent in the allocation group agno.
- * Type and bno are used to determine where in the allocation group the
- * extent will start.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent(
- xfs_alloc_arg_t *args) /* argument structure for allocation */
-{
- int error=0;
-
- ASSERT(args->minlen > 0);
- ASSERT(args->maxlen > 0);
- ASSERT(args->minlen <= args->maxlen);
- ASSERT(args->mod < args->prod);
- ASSERT(args->alignment > 0);
-
- /*
- * Branch to correct routine based on the type.
- */
- args->wasfromfl = 0;
- switch (args->type) {
- case XFS_ALLOCTYPE_THIS_AG:
- error = xfs_alloc_ag_vextent_size(args);
- break;
- case XFS_ALLOCTYPE_NEAR_BNO:
- error = xfs_alloc_ag_vextent_near(args);
- break;
- case XFS_ALLOCTYPE_THIS_BNO:
- error = xfs_alloc_ag_vextent_exact(args);
- break;
- default:
- ASSERT(0);
- /* NOTREACHED */
- }
-
- if (error || args->agbno == NULLAGBLOCK)
- return error;
-
- ASSERT(args->len >= args->minlen);
- ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
- ASSERT(args->agbno % args->alignment == 0);
-
- /* if not file data, insert new block into the reverse map btree */
- if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
- error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
- args->agbno, args->len, &args->oinfo);
- if (error)
- return error;
- }
-
- if (!args->wasfromfl) {
- error = xfs_alloc_update_counters(args->tp, args->agbp,
- -((long)(args->len)));
- if (error)
- return error;
-
- ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
- args->agbno, args->len));
- }
-
- xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
-
- XFS_STATS_INC(args->mp, xs_allocx);
- XFS_STATS_ADD(args->mp, xs_allocb, args->len);
- return error;
-}
-
-/*
* Allocate a variable extent at exactly agno/bno.
* Extent's length (returned in *len) will be between minlen and maxlen,
* and of the form k * prod + mod unless there's nothing that large.
@@ -1214,7 +1315,6 @@ STATIC int /* error */
xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
- struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
@@ -1231,8 +1331,8 @@ xfs_alloc_ag_vextent_exact(
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1252,6 +1352,7 @@ xfs_alloc_ag_vextent_exact(
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1262,7 +1363,8 @@ xfs_alloc_ag_vextent_exact(
*/
tbno = fbno;
tlen = flen;
- xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
+ xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen,
+ &tbno, &tlen, &busy_gen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -1291,9 +1393,9 @@ xfs_alloc_ag_vextent_exact(
* We are allocating agbno for args->len
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
- ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
+ ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
if (error) {
@@ -1364,7 +1466,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_ag.abt.active = false;
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
if (count > 0)
count--;
@@ -1389,7 +1491,6 @@ xfs_alloc_ag_vextent_locality(
bool fbinc;
ASSERT(acur->len == 0);
- ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
*stat = 0;
@@ -1479,7 +1580,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_ag.abt.active = true;
+ acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1532,8 +1633,10 @@ xfs_alloc_ag_vextent_lastblock(
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(acur->cnt);
return -EFSCORRUPTED;
+ }
if (*len >= args->minlen)
break;
error = xfs_btree_increment(acur->cnt, 0, &i);
@@ -1569,7 +1672,8 @@ xfs_alloc_ag_vextent_lastblock(
*/
STATIC int
xfs_alloc_ag_vextent_near(
- struct xfs_alloc_arg *args)
+ struct xfs_alloc_arg *args,
+ uint32_t alloc_flags)
{
struct xfs_alloc_cur acur = {};
int error; /* error code */
@@ -1588,6 +1692,8 @@ xfs_alloc_ag_vextent_near(
if (args->agbno > args->max_agbno)
args->agbno = args->max_agbno;
+ /* Retry once quickly if we find busy extents before blocking. */
+ alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
len = 0;
@@ -1643,9 +1749,21 @@ restart:
*/
if (!acur.len) {
if (acur.busy) {
+ /*
+ * Our only valid extents must have been busy. Flush and
+ * retry the allocation again. If we get an -EAGAIN
+ * error, we're being told that a deadlock was avoided
+ * and the current transaction needs committing before
+ * the allocation can be retried.
+ */
trace_xfs_alloc_near_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag,
- acur.busy_gen);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), acur.busy_gen,
+ alloc_flags);
+ if (error)
+ goto out;
+
+ alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
goto restart;
}
trace_xfs_alloc_size_neither(args);
@@ -1668,28 +1786,31 @@ out:
* and of the form k * prod + mod unless there's nothing that large.
* Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
*/
-STATIC int /* error */
+static int
xfs_alloc_ag_vextent_size(
- xfs_alloc_arg_t *args) /* allocation argument structure */
+ struct xfs_alloc_arg *args,
+ uint32_t alloc_flags)
{
- struct xfs_agf *agf = args->agbp->b_addr;
- struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
- struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
- int error; /* error result */
- xfs_agblock_t fbno; /* start of found freespace */
- xfs_extlen_t flen; /* length of found freespace */
- int i; /* temp status variable */
- xfs_agblock_t rbno; /* returned block number */
- xfs_extlen_t rlen; /* length of returned extent */
- bool busy;
- unsigned busy_gen;
+ struct xfs_agf *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno; /* start of found freespace */
+ xfs_extlen_t flen; /* length of found freespace */
+ xfs_agblock_t rbno; /* returned block number */
+ xfs_extlen_t rlen; /* length of returned extent */
+ bool busy;
+ unsigned busy_gen;
+ int error;
+ int i;
+ /* Retry once quickly if we find busy extents before blocking. */
+ alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
bno_cur = NULL;
/*
@@ -1728,6 +1849,7 @@ restart:
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1741,19 +1863,26 @@ restart:
error = xfs_btree_increment(cnt_cur, 0, &i);
if (error)
goto error0;
- if (i == 0) {
- /*
- * Our only valid extents must have been busy.
- * Make it unbusy by forcing the log out and
- * retrying.
- */
- xfs_btree_del_cursor(cnt_cur,
- XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_busy(args);
- xfs_extent_busy_flush(args->mp,
- args->pag, busy_gen);
- goto restart;
- }
+ if (i)
+ continue;
+
+ /*
+ * Our only valid extents must have been busy. Flush and
+ * retry the allocation again. If we get an -EAGAIN
+ * error, we're being told that a deadlock was avoided
+ * and the current transaction needs committing before
+ * the allocation can be retried.
+ */
+ trace_xfs_alloc_size_busy(args);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
+ if (error)
+ goto error0;
+
+ alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ goto restart;
}
}
@@ -1768,6 +1897,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1790,10 +1920,11 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
- if (flen < bestrlen)
+ if (flen <= bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
@@ -1802,6 +1933,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1818,6 +1950,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1833,9 +1966,22 @@ restart:
args->len = rlen;
if (rlen < args->minlen) {
if (busy) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ /*
+ * Our only valid extents must have been busy. Flush and
+ * retry the allocation again. If we get an -EAGAIN
+ * error, we're being told that a deadlock was avoided
+ * and the current transaction needs committing before
+ * the allocation can be retried.
+ */
trace_xfs_alloc_size_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
+ if (error)
+ goto error0;
+
+ alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
goto restart;
}
goto out_nominleft;
@@ -1844,14 +1990,15 @@ restart:
rlen = args->len;
if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1863,6 +2010,7 @@ restart:
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1887,11 +2035,10 @@ out_nominleft:
/*
* Free the extent starting at agno/bno for length.
*/
-STATIC int
+int
xfs_free_ag_extent(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
@@ -1911,6 +2058,7 @@ xfs_free_ag_extent(
int i;
int error;
struct xfs_perag *pag = agbp->b_pag;
+ bool fixup_longest = false;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
@@ -1924,7 +2072,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1938,6 +2086,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1953,6 +2102,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1971,6 +2121,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1986,6 +2137,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1994,7 +2146,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2006,12 +2158,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2021,12 +2175,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2036,6 +2192,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2045,6 +2202,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2064,6 +2222,7 @@ xfs_free_ag_extent(
i != 1 ||
xxbno != ltbno ||
xxlen != ltlen)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2088,12 +2247,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2104,6 +2265,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2123,12 +2285,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2151,27 +2315,43 @@ xfs_free_ag_extent(
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
}
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
bno_cur = NULL;
+
/*
* In all cases we need to insert the new freespace in the by-size tree.
+ *
+ * If this new freespace is being inserted in the block that contains
+ * the largest free space in the btree, make sure we also fix up the
+ * agf->agf_longest tracker field.
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
+ if (xfs_alloc_cursor_at_lastrec(cnt_cur))
+ fixup_longest = true;
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
+ if (fixup_longest) {
+ error = xfs_alloc_fixup_longest(cnt_cur);
+ if (error)
+ goto error0;
+ }
+
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
@@ -2179,19 +2359,19 @@ xfs_free_ag_extent(
* Update the freespace totals in the ag and superblock.
*/
error = xfs_alloc_update_counters(tp, agbp, len);
- xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
if (error)
goto error0;
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
+ trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
+ trace_xfs_free_extent(pag, bno, len, type, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -2250,7 +2430,7 @@ xfs_alloc_longest_free_extent(
* reservations and AGFL rules in place, we can return this extent.
*/
if (pag->pagf_longest > delta)
- return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable,
pag->pagf_longest - delta);
/* Otherwise, let the caller try for 1 block if there's space. */
@@ -2267,23 +2447,41 @@ xfs_alloc_min_freelist(
struct xfs_perag *pag)
{
/* AG btrees have at least 1 level. */
- static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
- const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ const unsigned int bno_level = pag ? pag->pagf_bno_level : 1;
+ const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1;
+ const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1;
unsigned int min_free;
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
- min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
-
+ min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
@@ -2340,39 +2538,17 @@ xfs_alloc_space_available(
return true;
}
-int
-xfs_free_agfl_block(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t agbno,
- struct xfs_buf *agbp,
- struct xfs_owner_info *oinfo)
-{
- int error;
- struct xfs_buf *bp;
-
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
- XFS_AG_RESV_AGFL);
- if (error)
- return error;
-
- error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
- XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
- tp->t_mountp->m_bsize, 0, &bp);
- if (error)
- return error;
- xfs_trans_binval(tp, bp);
-
- return 0;
-}
-
/*
- * Check the agfl fields of the agf for inconsistency or corruption. The purpose
- * is to detect an agfl header padding mismatch between current and early v5
- * kernels. This problem manifests as a 1-slot size difference between the
- * on-disk flcount and the active [first, last] range of a wrapped agfl. This
- * may also catch variants of agfl count corruption unrelated to padding. Either
- * way, we'll reset the agfl and warn the user.
+ * Check the agfl fields of the agf for inconsistency or corruption.
+ *
+ * The original purpose was to detect an agfl header padding mismatch between
+ * current and early v5 kernels. This problem manifests as a 1-slot size
+ * difference between the on-disk flcount and the active [first, last] range of
+ * a wrapped agfl.
+ *
+ * However, we need to use these same checks to catch agfl count corruptions
+ * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
+ * way, we need to reset the agfl and warn the user.
*
* Return true if a reset is required before the agfl can be used, false
* otherwise.
@@ -2388,10 +2564,6 @@ xfs_agfl_needs_reset(
int agfl_size = xfs_agfl_size(mp);
int active;
- /* no agfl header on v4 supers */
- if (!xfs_has_crc(mp))
- return false;
-
/*
* The agf read verifier catches severe corruption of these fields.
* Repeat some sanity checks to cover a packed -> unpacked mismatch if
@@ -2435,13 +2607,13 @@ xfs_agfl_reset(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agf *agf = agbp->b_addr;
- ASSERT(pag->pagf_agflreset);
+ ASSERT(xfs_perag_agfl_needs_reset(pag));
trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
xfs_warn(mp,
"WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
"Please unmount and run xfs_repair.",
- pag->pag_agno, pag->pagf_flcount);
+ pag_agno(pag), pag->pagf_flcount);
agf->agf_flfirst = 0;
agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
@@ -2450,99 +2622,162 @@ xfs_agfl_reset(
XFS_AGF_FLCOUNT);
pag->pagf_flcount = 0;
- pag->pagf_agflreset = false;
-}
-
-/*
- * Defer an AGFL block free. This is effectively equivalent to
- * xfs_free_extent_later() with some special handling particular to AGFL blocks.
- *
- * Deferring AGFL frees helps prevent log reservation overruns due to too many
- * allocation operations in a transaction. AGFL frees are prone to this problem
- * because for one they are always freed one at a time. Further, an immediate
- * AGFL block free can cause a btree join and require another block free before
- * the real allocation can proceed. Deferring the free disconnects freeing up
- * the AGFL slot from freeing the block.
- */
-STATIC void
-xfs_defer_agfl_block(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_fsblock_t agbno,
- struct xfs_owner_info *oinfo)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent_free_item *new; /* new element */
-
- ASSERT(xfs_extfree_item_cache != NULL);
- ASSERT(oinfo != NULL);
-
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
- new->xefi_blockcount = 1;
- new->xefi_owner = oinfo->oi_owner;
-
- trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
-
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
}
/*
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-void
-__xfs_free_extent_later(
+static int
+xfs_defer_extent_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
- bool skip_discard)
+ enum xfs_ag_resv_type type,
+ unsigned int free_flags,
+ struct xfs_defer_pending **dfpp)
{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
+ struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_extfree_item_cache != NULL);
+ ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+ if (type != XFS_AG_RESV_NONE) {
+ ASSERT(type == XFS_AG_RESV_NONE);
+ return -EFSCORRUPTED;
+ }
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ } else {
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ }
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
- if (skip_discard)
- new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ xefi->xefi_startblock = bno;
+ xefi->xefi_blockcount = (xfs_extlen_t)len;
+ xefi->xefi_agresv = type;
+ if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
+ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (free_flags & XFS_FREE_EXTENT_REALTIME)
+ xefi->xefi_flags |= XFS_EFI_REALTIME;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
- new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
- new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
- new->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ xefi->xefi_owner = oinfo->oi_owner;
} else {
- new->xefi_owner = XFS_RMAP_OWN_NULL;
+ xefi->xefi_owner = XFS_RMAP_OWN_NULL;
}
- trace_xfs_bmap_free_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+
+ xfs_extent_free_defer_add(tp, xefi, dfpp);
+ return 0;
+}
+
+int /* error */
+xfs_free_extent_later(
+ struct xfs_trans *tp, /* transaction the deferred free is queued on */
+ xfs_fsblock_t bno, /* first block of the extent to free */
+ xfs_filblks_t len, /* length of the extent, in blocks */
+ const struct xfs_owner_info *oinfo, /* rmap owner info; may be NULL */
+ enum xfs_ag_resv_type type, /* reservation type stored in the item */
+ unsigned int free_flags) /* XFS_FREE_EXTENT_* flags */
+{
+ struct xfs_defer_pending *dontcare = NULL; /* pending-item slot this wrapper has no use for */
+
+ return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
+ &dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attaches a paused deferred extent free item to the
+ * transaction. Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is cancelled and then unpaused, the next transaction
+ * commit will log an EFD without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
+ * @args structure are set to the relevant values.
+ */
+int /* error */
+xfs_alloc_schedule_autoreap(
+ const struct xfs_alloc_arg *args, /* describes the extent to reap */
+ unsigned int free_flags, /* XFS_FREE_EXTENT_* flags */
+ struct xfs_alloc_autoreap *aarp) /* aarp->dfp is the pending-item slot */
+{
+ int error;
+
+ error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+ &args->oinfo, args->resv, free_flags, &aarp->dfp);
+ if (error)
+ return error; /* nothing gets paused on failure */
+
+ xfs_defer_item_pause(args->tp, aarp->dfp);
+ return 0;
+}
+
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ *
+ * Earlier, we created a paused deferred extent free item and attached it to
+ * this transaction so that we could automatically roll back a new space
+ * allocation if the system went down. Now we want to cancel the paused work
+ * item by marking the EFI stale so we don't actually free the space, unpausing
+ * the pending item and logging an EFD.
+ *
+ * The caller generally should have already mapped the space into the ondisk
+ * filesystem. If the reserved space was partially used, the caller must call
+ * xfs_free_extent_later to create a new EFI to free the unused space.
+ */
+void
+xfs_alloc_cancel_autoreap(
+ struct xfs_trans *tp, /* transaction used to unpause the item */
+ struct xfs_alloc_autoreap *aarp) /* autoreap state; aarp->dfp may be NULL */
+{
+ struct xfs_defer_pending *dfp = aarp->dfp;
+ struct xfs_extent_free_item *xefi;
+
+ if (!dfp)
+ return; /* no pending item recorded; nothing to cancel */
+
+ list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) /* every item on the pending work list */
+ xefi->xefi_flags |= XFS_EFI_CANCELLED;
+
+ xfs_defer_item_unpause(tp, dfp);
+}
+
+/*
+ * Commit automatic freeing of unwritten space in the filesystem.
+ *
+ * This unpauses an earlier _schedule_autoreap and commits to freeing the
+ * allocated space. Call this if none of the reserved space was used.
+ */
+void
+xfs_alloc_commit_autoreap(
+ struct xfs_trans *tp, /* transaction used to unpause the item */
+ struct xfs_alloc_autoreap *aarp) /* autoreap state; aarp->dfp may be NULL */
+{
+ if (aarp->dfp) /* no-op when no pending item is recorded */
+ xfs_defer_item_unpause(tp, aarp->dfp);
+}
-#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
* args->minlen.
@@ -2558,13 +2793,14 @@ xfs_exact_minlen_extent_available(
xfs_extlen_t flen;
int error = 0;
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag);
error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
if (error)
goto out;
if (*stat == 0) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -2581,7 +2817,6 @@ out:
return error;
}
-#endif
/*
* Decide whether to use this allocation group for this allocation.
@@ -2590,7 +2825,7 @@ out:
int /* error */
xfs_alloc_fix_freelist(
struct xfs_alloc_arg *args, /* allocation argument structure */
- int flags) /* XFS_ALLOC_FLAG_... */
+ uint32_t alloc_flags)
{
struct xfs_mount *mp = args->mp;
struct xfs_perag *pag = args->pag;
@@ -2605,8 +2840,8 @@ xfs_alloc_fix_freelist(
/* deferred ops (AGFL block frees) require permanent transactions */
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (!pag->pagf_init) {
- error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ if (!xfs_perag_initialised_agf(pag)) {
+ error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2620,14 +2855,15 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
- (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
- ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ if (xfs_perag_prefers_metadata(pag) &&
+ (args->datatype & XFS_ALLOC_USERDATA) &&
+ (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+ ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
}
need = xfs_alloc_min_freelist(mp, pag);
- if (!xfs_alloc_space_available(args, need, flags |
+ if (!xfs_alloc_space_available(args, need, alloc_flags |
XFS_ALLOC_FLAG_CHECK))
goto out_agbp_relse;
@@ -2636,7 +2872,7 @@ xfs_alloc_fix_freelist(
* Can fail if we're not blocking on locks, and it's held.
*/
if (!agbp) {
- error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2646,23 +2882,22 @@ xfs_alloc_fix_freelist(
}
/* reset a padding mismatched agfl before final free space check */
- if (pag->pagf_agflreset)
+ if (xfs_perag_agfl_needs_reset(pag))
xfs_agfl_reset(tp, agbp, pag);
/* If there isn't enough total space or single-extent, reject it. */
need = xfs_alloc_min_freelist(mp, pag);
- if (!xfs_alloc_space_available(args, need, flags))
+ if (!xfs_alloc_space_available(args, need, alloc_flags))
goto out_agbp_relse;
-#ifdef DEBUG
- if (args->alloc_minlen_only) {
+ if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) {
int stat;
error = xfs_exact_minlen_extent_available(args, agbp, &stat);
if (error || !stat)
goto out_agbp_relse;
}
-#endif
+
/*
* Make the freelist shorter if it's too long.
*
@@ -2689,17 +2924,32 @@ xfs_alloc_fix_freelist(
*/
memset(&targs, 0, sizeof(targs));
/* struct copy below */
- if (flags & XFS_ALLOC_FLAG_NORMAP)
+ if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
else
targs.oinfo = XFS_RMAP_OINFO_AG;
- while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
+ while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
+ pag->pagf_flcount > need) {
error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
- /* defer agfl frees */
- xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ /*
+ * Defer the AGFL block free.
+ *
+ * This helps to prevent log reservation overruns due to too
+ * many allocation operations in a transaction. AGFL frees are
+ * prone to this problem because for one they are always freed
+ * one at a time. Further, an immediate AGFL block free can
+ * cause a btree join and require another block free before the
+ * real allocation can proceed.
+ * Deferring the free disconnects freeing up the AGFL slot from
+ * freeing the block.
+ */
+ error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno),
+ 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0);
+ if (error)
+ goto out_agbp_relse;
}
targs.tp = tp;
@@ -2707,7 +2957,6 @@ xfs_alloc_fix_freelist(
targs.agbp = agbp;
targs.agno = args->agno;
targs.alignment = targs.minlen = targs.prod = 1;
- targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
@@ -2720,7 +2969,7 @@ xfs_alloc_fix_freelist(
targs.resv = XFS_AG_RESV_AGFL;
/* Allocate as many blocks as possible at once. */
- error = xfs_alloc_ag_vextent(&targs);
+ error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
if (error)
goto out_agflbp_relse;
@@ -2730,10 +2979,22 @@ xfs_alloc_fix_freelist(
* on a completely full ag.
*/
if (targs.agbno == NULLAGBLOCK) {
- if (flags & XFS_ALLOC_FLAG_FREEING)
+ if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
break;
goto out_agflbp_relse;
}
+
+ if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
+ error = xfs_rmap_alloc(tp, agbp, pag,
+ targs.agbno, targs.len, &targs.oinfo);
+ if (error)
+ goto out_agflbp_relse;
+ }
+ error = xfs_alloc_update_counters(tp, agbp,
+ -((long)(targs.len)));
+ if (error)
+ goto out_agflbp_relse;
+
/*
* Put each allocated block on the list.
*/
@@ -2798,12 +3059,15 @@ xfs_alloc_get_freelist(
*/
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
+ return -EFSCORRUPTED;
+
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- ASSERT(!pag->pagf_agflreset);
+ ASSERT(!xfs_perag_agfl_needs_reset(pag));
be32_add_cpu(&agf->agf_flcount, -1);
pag->pagf_flcount--;
@@ -2836,8 +3100,8 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_versionnum),
offsetof(xfs_agf_t, agf_seqno),
offsetof(xfs_agf_t, agf_length),
- offsetof(xfs_agf_t, agf_roots[0]),
- offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */
+ offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */
offsetof(xfs_agf_t, agf_flfirst),
offsetof(xfs_agf_t, agf_fllast),
offsetof(xfs_agf_t, agf_flcount),
@@ -2892,7 +3156,7 @@ xfs_alloc_put_freelist(
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- ASSERT(!pag->pagf_agflreset);
+ ASSERT(!xfs_perag_agfl_needs_reset(pag));
be32_add_cpu(&agf->agf_flcount, 1);
pag->pagf_flcount++;
@@ -2903,8 +3167,6 @@ xfs_alloc_put_freelist(
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_alloc_log_agf(tp, agbp, logflags);
-
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
@@ -2920,12 +3182,69 @@ xfs_alloc_put_freelist(
return 0;
}
+/*
+ * Check that this AGF/AGI header's sequence number and length match the AG
+ * number and size in fsblocks.
+ */
+xfs_failaddr_t /* NULL if the checks pass, otherwise __this_address */
+xfs_validate_ag_length(
+ struct xfs_buf *bp, /* header buffer; bp->b_pag may be NULL */
+ uint32_t seqno, /* sequence number taken from the header */
+ uint32_t length) /* AG length taken from the header */
+{
+ struct xfs_mount *mp = bp->b_mount;
+ /*
+ * During growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && seqno != pag_agno(bp->b_pag))
+ return __this_address;
+
+ /*
+ * Only the last AG in the filesystem is allowed to be shorter
+ * than the AG size recorded in the superblock.
+ */
+ if (length != mp->m_sb.sb_agblocks) {
+ /*
+ * During growfs, the new last AG can get here before we
+ * have updated the superblock. Give it a pass on the seqno
+ * check.
+ */
+ if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
+ return __this_address;
+ if (length < XFS_MIN_AG_BLOCKS)
+ return __this_address;
+ if (length > mp->m_sb.sb_agblocks)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+/*
+ * Verify the AGF is consistent.
+ *
+ * We do not verify the AGFL indexes in the AGF are fully consistent here
+ * because of issues with variable on-disk structure sizes. Instead, we check
+ * the agfl indexes for consistency when we initialise the perag from the AGF
+ * information after a read completes.
+ *
+ * If the index is inconsistent, then we mark the perag as needing an AGFL
+ * reset. The first AGFL update performed then resets the AGFL indexes and
+ * refills the AGFL with known good free blocks, allowing the filesystem to
+ * continue operating normally at the cost of a few leaked free space blocks.
+ */
static xfs_failaddr_t
xfs_agf_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_agf *agf = bp->b_addr;
+ xfs_failaddr_t fa;
+ uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno);
+ uint32_t agf_length = be32_to_cpu(agf->agf_length);
if (xfs_has_crc(mp)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
@@ -2937,63 +3256,57 @@ xfs_agf_verify(
if (!xfs_verify_magic(bp, agf->agf_magicnum))
return __this_address;
- if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
- be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
- be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
+ if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
return __this_address;
- if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
- return __this_address;
+ /*
+ * Both agf_seqno and agf_length need to be validated before anything
+ * else block number related in the AGF or AGFL can be checked.
+ */
+ fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
+ if (fa)
+ return fa;
- if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
- be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
return __this_address;
-
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
- mp->m_alloc_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
- mp->m_alloc_maxlevels)
+ if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
return __this_address;
-
- if (xfs_has_rmapbt(mp) &&
- (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
- mp->m_rmap_maxlevels))
+ if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
return __this_address;
- if (xfs_has_rmapbt(mp) &&
- be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > agf_length)
return __this_address;
- /*
- * during growfs operations, the perag is not fully initialised,
- * so we can't use it for any useful checking. growfs ensures we can't
- * use it by using uncached buffers that don't have the perag attached
- * so we can detect and avoid this problem.
- */
- if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ if (be32_to_cpu(agf->agf_bno_level) < 1 ||
+ be32_to_cpu(agf->agf_cnt_level) < 1 ||
+ be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_lazysbcount(mp) &&
- be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ be32_to_cpu(agf->agf_btreeblks) > agf_length)
return __this_address;
- if (xfs_has_reflink(mp) &&
- be32_to_cpu(agf->agf_refcount_blocks) >
- be32_to_cpu(agf->agf_length))
- return __this_address;
+ if (xfs_has_rmapbt(mp)) {
+ if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
+ return __this_address;
- if (xfs_has_reflink(mp) &&
- (be32_to_cpu(agf->agf_refcount_level) < 1 ||
- be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
- return __this_address;
+ if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
+ be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
+ return __this_address;
+ }
- return NULL;
+ if (xfs_has_reflink(mp)) {
+ if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+ be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
+ return __this_address;
+ }
+ return NULL;
}
static void
@@ -3008,7 +3321,7 @@ xfs_agf_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agf_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -3055,14 +3368,16 @@ xfs_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
- trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_read_agf(pag);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
if (error)
return error;
@@ -3082,12 +3397,13 @@ xfs_alloc_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *agfbp;
struct xfs_agf *agf;
int error;
int allocbt_blks;
- trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_alloc_read_agf(pag);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
@@ -3099,20 +3415,19 @@ xfs_alloc_read_agf(
return error;
agf = agfbp->b_addr;
- if (!pag->pagf_init) {
+ if (!xfs_perag_initialised_agf(pag)) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
- pag->pagf_init = 1;
- pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
+ if (xfs_agfl_needs_reset(mp, agf))
+ set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
+ else
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
/*
* Update the in-core allocbt counter. Filter out the rmapbt
@@ -3122,24 +3437,48 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_has_rmapbt(pag->pag_mount))
+ if (xfs_has_rmapbt(mp))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
- atomic64_add(allocbt_blks,
- &pag->pag_mount->m_allocbt_blks);
+ atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
+
+ set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
+
#ifdef DEBUG
- else if (!xfs_is_shutdown(pag->pag_mount)) {
- ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
- ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
- ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
- ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
- ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ /*
+ * It's possible for the AGF to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGF from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
+ */
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+ ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+ ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+ ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+ ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+ xfs_trans_brelse(tp, agfbp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
}
-#endif
+#endif /* DEBUG */
+
if (agfbpp)
*agfbpp = agfbp;
else
@@ -3148,26 +3487,25 @@ xfs_alloc_read_agf(
}
/*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
+ * Pre-process allocation arguments to set initial state that we don't require
+ * callers to set up correctly, as well as bounds check the allocation args
+ * that are set up.
*/
-int /* error */
-xfs_alloc_vextent(
- struct xfs_alloc_arg *args) /* allocation argument structure */
+static int
+xfs_alloc_vextent_check_args(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target,
+ xfs_agnumber_t *minimum_agno)
{
- xfs_agblock_t agsize; /* allocation group size */
- int error;
- int flags; /* XFS_ALLOC_FLAG_... locking flags */
- struct xfs_mount *mp; /* mount structure pointer */
- xfs_agnumber_t sagno; /* starting allocation group number */
- xfs_alloctype_t type; /* input allocation type */
- int bump_rotor = 0;
- xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-
- mp = args->mp;
- type = args->otype = args->type;
- args->agbno = NULLAGBLOCK;
+ struct xfs_mount *mp = args->mp;
+ xfs_agblock_t agsize;
+
+ args->fsbno = NULLFSBLOCK;
+
+ *minimum_agno = 0;
+ if (args->tp->t_highest_agno != NULLAGNUMBER)
+ *minimum_agno = args->tp->t_highest_agno;
+
/*
* Just fix this up, for the case where the last a.g. is shorter
* (or there's only one a.g.) and the caller couldn't easily figure
@@ -3178,168 +3516,452 @@ xfs_alloc_vextent(
args->maxlen = agsize;
if (args->alignment == 0)
args->alignment = 1;
- ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+
+ ASSERT(args->minlen > 0);
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->alignment > 0);
+ ASSERT(args->resv != XFS_AG_RESV_AGFL);
+
+ ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->minlen <= agsize);
ASSERT(args->mod < args->prod);
- if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
- XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+
+ if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
+ XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
args->minlen > args->maxlen || args->minlen > agsize ||
args->mod >= args->prod) {
- args->fsbno = NULLFSBLOCK;
trace_xfs_alloc_vextent_badargs(args);
+ return -ENOSPC;
+ }
+
+ if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
+ trace_xfs_alloc_vextent_skip_deadlock(args);
+ return -ENOSPC;
+ }
+ return 0;
+
+}
+
+/*
+ * Prepare an AG for allocation. If the AG is not prepared to accept the
+ * allocation, return failure.
+ *
+ * An error from xfs_alloc_fix_freelist() is passed back to the caller with
+ * args->agbno set to NULLAGBLOCK. If the freelist fixup succeeds but leaves
+ * args->agbp NULL, this returns 0 with args->agbno set to NULLAGBLOCK, so
+ * callers must check args->agbp before attempting an allocation.
+ *
+ * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
+ * modified to hold their own perag references.
+ */
+static int
+xfs_alloc_vextent_prepare_ag(
+ struct xfs_alloc_arg *args,
+ uint32_t alloc_flags)
+{
+ bool need_pag = !args->pag;
+ int error;
+
+ if (need_pag)
+ args->pag = xfs_perag_get(args->mp, args->agno);
+
+ args->agbp = NULL;
+ error = xfs_alloc_fix_freelist(args, alloc_flags);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ if (need_pag)
+ xfs_perag_put(args->pag);
+ args->agbno = NULLAGBLOCK;
+ return error;
+ }
+ if (!args->agbp) {
+ /* cannot allocate in this AG at all */
+ trace_xfs_alloc_vextent_noagbp(args);
+ args->agbno = NULLAGBLOCK;
return 0;
}
+ args->wasfromfl = 0;
+ return 0;
+}
- switch (type) {
- case XFS_ALLOCTYPE_THIS_AG:
- case XFS_ALLOCTYPE_NEAR_BNO:
- case XFS_ALLOCTYPE_THIS_BNO:
- /*
- * These three force us into a single a.g.
- */
- args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- args->pag = xfs_perag_get(mp, args->agno);
- error = xfs_alloc_fix_freelist(args, 0);
- if (error) {
- trace_xfs_alloc_vextent_nofix(args);
- goto error0;
- }
- if (!args->agbp) {
- trace_xfs_alloc_vextent_noagbp(args);
+/*
+ * Post-process allocation results to account for the allocation if it
+ * succeeded and set the allocated block number correctly for the caller.
+ *
+ * @alloc_error is the result of the allocation attempt. If it is non-zero, or
+ * if args->agbno is NULLAGBLOCK, args->fsbno is set to NULLFSBLOCK and
+ * @alloc_error is returned. If @drop_perag is true, args->pag is released and
+ * cleared before returning on both the success and the failure paths.
+ *
+ * XXX: we should really be returning ENOSPC for ENOSPC, not
+ * hiding it behind a "successful" NULLFSBLOCK allocation.
+ */
+static int
+xfs_alloc_vextent_finish(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t minimum_agno,
+ int alloc_error,
+ bool drop_perag)
+{
+ struct xfs_mount *mp = args->mp;
+ int error = 0;
+
+ /*
+ * We can end up here with a locked AGF. If we failed, the caller is
+ * likely going to try to allocate again with different parameters, and
+ * that can widen the AGs that are searched for free space. If we have
+ * to do BMBT block allocation, we have to do a new allocation.
+ *
+ * Hence leaving this function with the AGF locked opens up potential
+ * ABBA AGF deadlocks because a future allocation attempt in this
+ * transaction may attempt to lock a lower number AGF.
+ *
+ * We can't release the AGF until the transaction is committed, so at
+ * this point we must update the "first allocation" tracker to point at
+ * this AG if the tracker is empty or points to a lower AG. This allows
+ * the next allocation attempt to be modified appropriately to avoid
+ * deadlocks.
+ */
+ if (args->agbp &&
+ (args->tp->t_highest_agno == NULLAGNUMBER ||
+ args->agno > minimum_agno))
+ args->tp->t_highest_agno = args->agno;
+
+ /*
+ * If the allocation failed with an error or we had an ENOSPC result,
+ * preserve the returned error whilst also marking the allocation result
+ * as "no extent allocated". This ensures that callers that fail to
+ * capture the error will still treat it as a failed allocation.
+ */
+ if (alloc_error || args->agbno == NULLAGBLOCK) {
+ args->fsbno = NULLFSBLOCK;
+ error = alloc_error;
+ goto out_drop_perag;
+ }
+
+ args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);
+
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(args->agbno % args->alignment == 0);
+ XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
+
+ /* if not file data, insert new block into the reverse map btree */
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ goto out_drop_perag;
+ }
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
+ -((long)(args->len)));
+ if (error)
+ goto out_drop_perag;
+
+ ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
+ args->agbno, args->len));
+ }
+
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+ XFS_STATS_INC(mp, xs_allocx);
+ XFS_STATS_ADD(mp, xs_allocb, args->len);
+
+ trace_xfs_alloc_vextent_finish(args);
+
+out_drop_perag:
+ if (drop_perag && args->pag) {
+ xfs_perag_rele(args->pag);
+ args->pag = NULL;
+ }
+ return error;
+}
+
+/*
+ * Allocate within a single AG only. This uses a best-fit length algorithm so if
+ * you need an exact sized allocation without locality constraints, this is the
+ * fastest way to do it.
+ *
+ * Caller is expected to hold a perag reference in args->pag. That reference is
+ * not dropped here; it is left with the caller.
+ *
+ * If argument checking fails with -ENOSPC, this returns 0 with args->fsbno
+ * set to NULLFSBLOCK rather than returning an error.
+ */
+int
+xfs_alloc_vextent_this_ag(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t agno)
+{
+ xfs_agnumber_t minimum_agno;
+ uint32_t alloc_flags = 0;
+ int error;
+
+ ASSERT(args->pag != NULL);
+ ASSERT(pag_agno(args->pag) == agno);
+
+ args->agno = agno;
+ args->agbno = 0;
+
+ trace_xfs_alloc_vextent_this_ag(args);
+
+ error = xfs_alloc_vextent_check_args(args,
+ xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_size(args, alloc_flags);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Iterate all AGs trying to allocate an extent starting from @start_agno.
+ *
+ * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
+ * allocation attempts in @start_agno have locality information. If we fail to
+ * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
+ * we attempt to allocate in as there is no locality optimisation possible for
+ * those allocations.
+ *
+ * On return, args->pag may be left referenced if we finish before the "all
+ * failed" return point. The allocation finish still needs the perag, and
+ * so the caller will release it once they've finished the allocation.
+ *
+ * When we wrap the AG iteration at the end of the filesystem, we have to be
+ * careful not to wrap into AGs below ones we already have locked in the
+ * transaction if we are doing a blocking iteration. This will result in an
+ * out-of-order locking of AGFs and hence can cause deadlocks.
+ */
+static int
+xfs_alloc_vextent_iterate_ags(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t minimum_agno,
+ xfs_agnumber_t start_agno,
+ xfs_agblock_t target_agbno,
+ uint32_t alloc_flags)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t restart_agno = minimum_agno;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
+ restart_agno = 0;
+restart:
+ for_each_perag_wrap_range(mp, start_agno, restart_agno,
+ mp->m_sb.sb_agcount, agno, args->pag) {
+ args->agno = agno;
+ error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+ if (error)
break;
+ if (!args->agbp) {
+ trace_xfs_alloc_vextent_loopfailed(args);
+ continue;
}
- args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
- if ((error = xfs_alloc_ag_vextent(args)))
- goto error0;
- break;
- case XFS_ALLOCTYPE_START_BNO:
- /*
- * Try near allocation first, then anywhere-in-ag after
- * the first a.g. fails.
- */
- if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
- xfs_is_inode32(mp)) {
- args->fsbno = XFS_AGB_TO_FSB(mp,
- ((mp->m_agfrotor / rotorstep) %
- mp->m_sb.sb_agcount), 0);
- bump_rotor = 1;
- }
- args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- fallthrough;
- case XFS_ALLOCTYPE_FIRST_AG:
+
/*
- * Rotate through the allocation groups looking for a winner.
+ * Allocation is supposed to succeed now, so break out of the
+ * loop regardless of whether we succeed or not.
*/
- if (type == XFS_ALLOCTYPE_FIRST_AG) {
- /*
- * Start with allocation group given by bno.
- */
- args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- args->type = XFS_ALLOCTYPE_THIS_AG;
- sagno = 0;
- flags = 0;
+ if (args->agno == start_agno && target_agbno) {
+ args->agbno = target_agbno;
+ error = xfs_alloc_ag_vextent_near(args, alloc_flags);
} else {
- /*
- * Start with the given allocation group.
- */
- args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- flags = XFS_ALLOC_FLAG_TRYLOCK;
- }
- /*
- * Loop over allocation groups twice; first time with
- * trylock set, second time without.
- */
- for (;;) {
- args->pag = xfs_perag_get(mp, args->agno);
- error = xfs_alloc_fix_freelist(args, flags);
- if (error) {
- trace_xfs_alloc_vextent_nofix(args);
- goto error0;
- }
- /*
- * If we get a buffer back then the allocation will fly.
- */
- if (args->agbp) {
- if ((error = xfs_alloc_ag_vextent(args)))
- goto error0;
- break;
- }
-
- trace_xfs_alloc_vextent_loopfailed(args);
-
- /*
- * Didn't work, figure out the next iteration.
- */
- if (args->agno == sagno &&
- type == XFS_ALLOCTYPE_START_BNO)
- args->type = XFS_ALLOCTYPE_THIS_AG;
- /*
- * For the first allocation, we can try any AG to get
- * space. However, if we already have allocated a
- * block, we don't want to try AGs whose number is below
- * sagno. Otherwise, we may end up with out-of-order
- * locking of AGF, which might cause deadlock.
- */
- if (++(args->agno) == mp->m_sb.sb_agcount) {
- if (args->tp->t_firstblock != NULLFSBLOCK)
- args->agno = sagno;
- else
- args->agno = 0;
- }
- /*
- * Reached the starting a.g., must either be done
- * or switch to non-trylock mode.
- */
- if (args->agno == sagno) {
- if (flags == 0) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_vextent_allfailed(args);
- break;
- }
-
- flags = 0;
- if (type == XFS_ALLOCTYPE_START_BNO) {
- args->agbno = XFS_FSB_TO_AGBNO(mp,
- args->fsbno);
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- }
- xfs_perag_put(args->pag);
- }
- if (bump_rotor) {
- if (args->agno == sagno)
- mp->m_agfrotor = (mp->m_agfrotor + 1) %
- (mp->m_sb.sb_agcount * rotorstep);
- else
- mp->m_agfrotor = (args->agno * rotorstep + 1) %
- (mp->m_sb.sb_agcount * rotorstep);
+ args->agbno = 0;
+ error = xfs_alloc_ag_vextent_size(args, alloc_flags);
}
break;
- default:
- ASSERT(0);
- /* NOTREACHED */
}
- if (args->agbno == NULLAGBLOCK)
- args->fsbno = NULLFSBLOCK;
- else {
- args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
- ASSERT(args->len >= args->minlen);
- ASSERT(args->len <= args->maxlen);
- ASSERT(args->agbno % args->alignment == 0);
- XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
- args->len);
-#endif
+ if (error) {
+ xfs_perag_rele(args->pag);
+ args->pag = NULL;
+ return error;
+ }
+ if (args->agbp)
+ return 0;
+ /*
+ * We didn't find an AG we can allocate from. If we were given
+ * constraining flags by the caller, drop them and retry the allocation
+ * without any constraints being set.
+ */
+ if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
+ alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
+ restart_agno = minimum_agno;
+ goto restart;
}
- xfs_perag_put(args->pag);
+
+ ASSERT(args->pag == NULL);
+ trace_xfs_alloc_vextent_allfailed(args);
return 0;
-error0:
- xfs_perag_put(args->pag);
- return error;
+}
+
+/*
+ * Iterate the AGs from the start AG to the end of the filesystem, trying
+ * to allocate blocks. It starts with a near allocation attempt in the initial
+ * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
+ * back to zero if allowed by previous allocations in this transaction,
+ * otherwise will wrap back to the start AG and run a second blocking pass to
+ * the end of the filesystem.
+ *
+ * For initial user data allocations on inode32 filesystems, @target is
+ * replaced by block 0 of the AG selected by mp->m_agfrotor, and the rotor is
+ * advanced after the allocation attempt.
+ *
+ * args->pag is asserted to be NULL on entry; any perag left in args->pag by
+ * the iteration is released before returning. If argument checking fails with
+ * -ENOSPC, this returns 0 with args->fsbno set to NULLFSBLOCK.
+ */
+int
+xfs_alloc_vextent_start_ag(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ xfs_agnumber_t start_agno;
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
+ bool bump_rotor = false;
+ uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
+ int error;
+
+ ASSERT(args->pag == NULL);
+
+ args->agno = NULLAGNUMBER;
+ args->agbno = NULLAGBLOCK;
+
+ trace_xfs_alloc_vextent_start_ag(args);
+
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+ xfs_is_inode32(mp)) {
+ target = XFS_AGB_TO_FSB(mp,
+ ((mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount), 0);
+ bump_rotor = 1;
+ }
+
+ start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+ error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+ XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
+
+ if (bump_rotor) {
+ if (args->agno == start_agno)
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ else
+ mp->m_agfrotor = (args->agno * rotorstep + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ }
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
+
+/*
+ * Iterate from the agno indicated via @target through to the end of the
+ * filesystem attempting blocking allocation. This does not wrap or try a second
+ * pass, so will not recurse into AGs lower than indicated by the target.
+ *
+ * NOTE(review): alloc_flags is initialised to XFS_ALLOC_FLAG_TRYLOCK below, and
+ * xfs_alloc_vextent_iterate_ags() reacts to that flag by using AG 0 as the
+ * restart point and then retrying with the flag cleared when no AG was found.
+ * That looks at odds with the "blocking" and "does not wrap or try a second
+ * pass" wording above - confirm which of the two is intended.
+ *
+ * args->pag is asserted to be NULL on entry; any perag left in args->pag by
+ * the iteration is released before returning. If argument checking fails with
+ * -ENOSPC, this returns 0 with args->fsbno set to NULLFSBLOCK.
+ */
+int
+xfs_alloc_vextent_first_ag(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+ {
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ xfs_agnumber_t start_agno;
+ uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
+ int error;
+
+ ASSERT(args->pag == NULL);
+
+ args->agno = NULLAGNUMBER;
+ args->agbno = NULLAGBLOCK;
+
+ trace_xfs_alloc_vextent_first_ag(args);
+
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+ error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+ XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
+
+/*
+ * Allocate at the exact block target or fail. Caller is expected to hold a
+ * perag reference in args->pag. That reference is not dropped here; it is left
+ * with the caller.
+ *
+ * If argument checking fails with -ENOSPC, this returns 0 with args->fsbno
+ * set to NULLFSBLOCK rather than returning an error.
+ */
+int
+xfs_alloc_vextent_exact_bno(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ int error;
+
+ ASSERT(args->pag != NULL);
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
+
+ args->agno = XFS_FSB_TO_AGNO(mp, target);
+ args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+
+ trace_xfs_alloc_vextent_exact_bno(args);
+
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ error = xfs_alloc_vextent_prepare_ag(args, 0);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_exact(args);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Allocate an extent as close to the target as possible. If there are no
+ * viable candidates in the AG, then fail the allocation.
+ *
+ * Caller may or may not have a per-ag reference in args->pag. If args->pag is
+ * NULL on entry, a reference is taken with xfs_perag_grab() after argument
+ * checking and released again by xfs_alloc_vextent_finish(); a caller-supplied
+ * perag is left referenced.
+ *
+ * NOTE(review): the xfs_perag_grab() result is passed on without a NULL check -
+ * confirm it cannot fail for an agno that passed argument validation.
+ *
+ * If argument checking fails with -ENOSPC, this returns 0 with args->fsbno
+ * set to NULLFSBLOCK rather than returning an error.
+ */
+int
+xfs_alloc_vextent_near_bno(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ bool needs_perag = args->pag == NULL;
+ uint32_t alloc_flags = 0;
+ int error;
+
+ if (!needs_perag)
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
+
+ args->agno = XFS_FSB_TO_AGNO(mp, target);
+ args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+
+ trace_xfs_alloc_vextent_near_bno(args);
+
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ if (needs_perag)
+ args->pag = xfs_perag_grab(mp, args->agno);
+
+ error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_near(args, alloc_flags);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}
/* Ensure that the freelist is at full capacity. */
@@ -3355,7 +3977,7 @@ xfs_free_extent_fix_freelist(
memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = pag->pag_agno;
+ args.agno = pag_agno(pag);
args.pag = pag;
/*
@@ -3381,7 +4003,8 @@ xfs_free_extent_fix_freelist(
int
__xfs_free_extent(
struct xfs_trans *tp,
- xfs_fsblock_t bno,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
@@ -3389,51 +4012,49 @@ __xfs_free_extent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
- struct xfs_perag *pag;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
- pag = xfs_perag_get(mp, agno);
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
- if (error)
- goto err;
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
+ return error;
+ }
+
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+ error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
if (error)
goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
- xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
- xfs_perag_put(pag);
+ xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
return 0;
err_release:
xfs_trans_brelse(tp, agbp);
-err:
- xfs_perag_put(pag);
return error;
}
@@ -3451,9 +4072,13 @@ xfs_alloc_query_range_helper(
{
struct xfs_alloc_query_range_info *query = priv;
struct xfs_alloc_rec_incore irec;
+ xfs_failaddr_t fa;
+
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
+ if (fa)
+ return xfs_alloc_complain_bad_rec(cur, fa, &irec);
- irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
- irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
return query->fn(cur, &irec, query->priv);
}
@@ -3466,15 +4091,11 @@ xfs_alloc_query_range(
xfs_alloc_query_range_fn fn,
void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
- struct xfs_alloc_query_range_info query;
+ union xfs_btree_irec low_brec = { .a = *low_rec };
+ union xfs_btree_irec high_brec = { .a = *high_rec };
+ struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
- low_brec.a = *low_rec;
- high_brec.a = *high_rec;
- query.priv = priv;
- query.fn = fn;
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_alloc_query_range_helper, &query);
}
@@ -3488,19 +4109,22 @@ xfs_alloc_query_all(
{
struct xfs_alloc_query_range_info query;
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
query.priv = priv;
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the free space and tell us if the area has no
+ * records, is fully mapped by records, or is partially filled.
+ */
int
-xfs_alloc_has_record(
+xfs_alloc_has_records(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- bool *exists)
+ enum xbtree_recpacking *outcome)
{
union xfs_btree_irec low;
union xfs_btree_irec high;
@@ -3510,7 +4134,7 @@ xfs_alloc_has_record(
memset(&high, 0xFF, sizeof(high));
high.a.ar_startblock = bno + len - 1;
- return xfs_btree_has_record(cur, &low, &high, exists);
+ return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
/*