summaryrefslogtreecommitdiff
path: root/fs/xfs/libxfs/xfs_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/libxfs/xfs_alloc.c')
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c325
1 files changed, 260 insertions, 65 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 744dcaec34cc..39387bdd225d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -31,6 +31,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
@@ -52,6 +53,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+/*
+ * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
+ * the beginning of the block for a proper header with the location information
+ * and CRC.
+ */
+unsigned int
+xfs_agfl_size(
+ struct xfs_mount *mp)
+{
+ unsigned int size = mp->m_sb.sb_sectsize;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ size -= sizeof(struct xfs_agfl);
+
+ return size / sizeof(xfs_agblock_t);
+}
+
unsigned int
xfs_refc_block(
struct xfs_mount *mp)
@@ -166,7 +184,7 @@ xfs_alloc_lookup_ge(
* Lookup the first record less than or equal to [bno, len]
* in the btree given by cur.
*/
-static int /* error */
+int /* error */
xfs_alloc_lookup_le(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -519,7 +537,7 @@ xfs_alloc_fixup_trees(
return 0;
}
-static bool
+static xfs_failaddr_t
xfs_agfl_verify(
struct xfs_buf *bp)
{
@@ -527,10 +545,19 @@ xfs_agfl_verify(
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return NULL;
+
if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
+ return __this_address;
if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
- return false;
+ return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -538,16 +565,17 @@ xfs_agfl_verify(
* so we can detect and avoid this problem.
*/
if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
- return false;
+ return __this_address;
- for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ for (i = 0; i < xfs_agfl_size(mp); i++) {
if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
- return false;
+ return __this_address;
}
- return xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
+ return __this_address;
+ return NULL;
}
static void
@@ -555,6 +583,7 @@ xfs_agfl_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
+ xfs_failaddr_t fa;
/*
* There is no verification of non-crc AGFLs because mkfs does not
@@ -566,28 +595,29 @@ xfs_agfl_read_verify(
return;
if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
- xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (!xfs_agfl_verify(bp))
- xfs_buf_ioerror(bp, -EFSCORRUPTED);
-
- if (bp->b_error)
- xfs_verifier_error(bp);
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agfl_verify(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
}
static void
xfs_agfl_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
/* no verification of non-crc AGFLs */
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
- if (!xfs_agfl_verify(bp)) {
- xfs_buf_ioerror(bp, -EFSCORRUPTED);
- xfs_verifier_error(bp);
+ fa = xfs_agfl_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
@@ -601,6 +631,7 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
+ .verify_struct = xfs_agfl_verify,
};
/*
@@ -701,7 +732,7 @@ xfs_alloc_ag_vextent(
ASSERT(args->agbno % args->alignment == 0);
/* if not file data, insert new block into the reverse map btree */
- if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
args->agbno, args->len, &args->oinfo);
if (error)
@@ -1550,7 +1581,6 @@ xfs_alloc_ag_vextent_small(
int *stat) /* status: 0-freelist, 1-normal/none */
{
struct xfs_owner_info oinfo;
- struct xfs_perag *pag;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1584,6 +1614,10 @@ xfs_alloc_ag_vextent_small(
bp = xfs_btree_get_bufs(args->mp, args->tp,
args->agno, fbno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_trans_binval(args->tp, bp);
}
args->len = 1;
@@ -1598,18 +1632,13 @@ xfs_alloc_ag_vextent_small(
/*
* If we're feeding an AGFL block to something that
* doesn't live in the free space, we need to clear
- * out the OWN_AG rmap and add the block back to
- * the AGFL per-AG reservation.
+ * out the OWN_AG rmap.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
fbno, 1, &oinfo);
if (error)
goto error0;
- pag = xfs_perag_get(args->mp, args->agno);
- xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
- args->tp, 1);
- xfs_perag_put(pag);
*stat = 0;
return 0;
@@ -1677,7 +1706,7 @@ xfs_free_ag_extent(
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
- if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ if (!xfs_rmap_should_skip_owner_update(oinfo)) {
error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
if (error)
goto error0;
@@ -1893,14 +1922,12 @@ xfs_free_ag_extent(
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
- haveleft, haveright);
+ trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
- -1, -1);
+ trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -2036,6 +2063,93 @@ xfs_alloc_space_available(
}
/*
+ * Check the agfl fields of the agf for inconsistency or corruption. The purpose
+ * is to detect an agfl header padding mismatch between current and early v5
+ * kernels. This problem manifests as a 1-slot size difference between the
+ * on-disk flcount and the active [first, last] range of a wrapped agfl. This
+ * may also catch variants of agfl count corruption unrelated to padding. Either
+ * way, we'll reset the agfl and warn the user.
+ *
+ * Return true if a reset is required before the agfl can be used, false
+ * otherwise.
+ */
+static bool
+xfs_agfl_needs_reset(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf)
+{
+ uint32_t f = be32_to_cpu(agf->agf_flfirst);
+ uint32_t l = be32_to_cpu(agf->agf_fllast);
+ uint32_t c = be32_to_cpu(agf->agf_flcount);
+ int agfl_size = xfs_agfl_size(mp);
+ int active;
+
+ /* no agfl header on v4 supers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+
+ /*
+ * The agf read verifier catches severe corruption of these fields.
+ * Repeat some sanity checks to cover a packed -> unpacked mismatch if
+ * the verifier allows it.
+ */
+ if (f >= agfl_size || l >= agfl_size)
+ return true;
+ if (c > agfl_size)
+ return true;
+
+ /*
+ * Check consistency between the on-disk count and the active range. An
+ * agfl padding mismatch manifests as an inconsistent flcount.
+ */
+ if (c && l >= f)
+ active = l - f + 1;
+ else if (c)
+ active = agfl_size - f + l + 1;
+ else
+ active = 0;
+
+ return active != c;
+}
+
+/*
+ * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
+ * agfl content cannot be trusted. Warn the user that a repair is required to
+ * recover leaked blocks.
+ *
+ * The purpose of this mechanism is to handle filesystems affected by the agfl
+ * header padding mismatch problem. A reset keeps the filesystem online with a
+ * relatively minor free space accounting inconsistency rather than suffer the
+ * inevitable crash from use of an invalid agfl block.
+ */
+static void
+xfs_agfl_reset(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+
+ ASSERT(pag->pagf_agflreset);
+ trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
+
+ xfs_warn(mp,
+ "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
+ "Please unmount and run xfs_repair.",
+ pag->pag_agno, pag->pagf_flcount);
+
+ agf->agf_flfirst = 0;
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
+ agf->agf_flcount = 0;
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
+ XFS_AGF_FLCOUNT);
+
+ pag->pagf_flcount = 0;
+ pag->pagf_agflreset = false;
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -2096,6 +2210,10 @@ xfs_alloc_fix_freelist(
}
}
+ /* reset a padding mismatched agfl before final free space check */
+ if (pag->pagf_agflreset)
+ xfs_agfl_reset(tp, agbp, pag);
+
/* If there isn't enough total space or single-extent, reject it. */
need = xfs_alloc_min_freelist(mp, pag);
if (!xfs_alloc_space_available(args, need, flags))
@@ -2141,6 +2259,10 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto out_agbp_relse;
+ }
xfs_trans_binval(tp, bp);
}
@@ -2244,10 +2366,11 @@ xfs_alloc_get_freelist(
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
- if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+ if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
@@ -2355,10 +2478,11 @@ xfs_alloc_put_freelist(
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
be32_add_cpu(&agf->agf_fllast, 1);
- if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+ if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
@@ -2373,7 +2497,7 @@ xfs_alloc_put_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
- ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+ ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
@@ -2388,39 +2512,39 @@ xfs_alloc_put_freelist(
return 0;
}
-static bool
+static xfs_failaddr_t
xfs_agf_verify(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
- {
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
+ return __this_address;
if (!xfs_log_check_lsn(mp,
be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
- return false;
+ return __this_address;
}
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
- return false;
+ be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
+ return __this_address;
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
- return false;
+ return __this_address;
if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
- return false;
+ return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -2429,18 +2553,18 @@ xfs_agf_verify(
* so we can detect and avoid this problem.
*/
if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
- return false;
+ return __this_address;
if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
- return false;
+ return __this_address;
if (xfs_sb_version_hasreflink(&mp->m_sb) &&
(be32_to_cpu(agf->agf_refcount_level) < 1 ||
be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
- return false;
+ return __this_address;
- return true;;
+ return NULL;
}
@@ -2449,28 +2573,29 @@ xfs_agf_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
+ xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
- xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
- XFS_ERRTAG_ALLOC_READ_AGF))
- xfs_buf_ioerror(bp, -EFSCORRUPTED);
-
- if (bp->b_error)
- xfs_verifier_error(bp);
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ else {
+ fa = xfs_agf_verify(bp);
+ if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ }
}
static void
xfs_agf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
- if (!xfs_agf_verify(mp, bp)) {
- xfs_buf_ioerror(bp, -EFSCORRUPTED);
- xfs_verifier_error(bp);
+ fa = xfs_agf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
@@ -2487,6 +2612,7 @@ const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
+ .verify_struct = xfs_agf_verify,
};
/*
@@ -2564,6 +2690,7 @@ xfs_alloc_read_agf(
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
+ pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
}
#ifdef DEBUG
else if (!XFS_FORCED_SHUTDOWN(mp)) {
@@ -2923,3 +3050,71 @@ xfs_alloc_query_all(
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
+
+/* Find the size of the AG, in blocks. */
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ ASSERT(agno < mp->m_sb.sb_agcount);
+
+ if (agno < mp->m_sb.sb_agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Verify that an AG block number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+bool
+xfs_verify_agbno(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno)
+{
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+ if (agbno >= eoag)
+ return false;
+ if (agbno <= XFS_AGFL_BLOCK(mp))
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an FS block number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+bool
+xfs_verify_fsbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno)
+{
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+}
+
+/* Is there a record covering a given extent? */
+int
+xfs_alloc_has_record(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool *exists)
+{
+ union xfs_btree_irec low;
+ union xfs_btree_irec high;
+
+ memset(&low, 0, sizeof(low));
+ low.a.ar_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.a.ar_startblock = bno + len - 1;
+
+ return xfs_btree_has_record(cur, &low, &high, exists);
+}