diff options
Diffstat (limited to 'fs/xfs/libxfs/xfs_ialloc.c')
| -rw-r--r-- | fs/xfs/libxfs/xfs_ialloc.c | 905 |
1 files changed, 568 insertions, 337 deletions
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5118dedf9267..d97295eaebe6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -27,6 +27,7 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. @@ -95,6 +96,61 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + +/* Simple checks for inode records. */ +xfs_failaddr_t +xfs_inobt_check_irec( + struct xfs_perag *pag, + const struct xfs_inobt_rec_incore *irec) +{ + /* Record has to be properly aligned within the AG. */ + if (!xfs_verify_agino(pag, irec->ir_startino)) + return __this_address; + if (!xfs_verify_agino(pag, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) + return __this_address; + if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || + irec->ir_count > XFS_INODES_PER_CHUNK) + return __this_address; + if (irec->ir_freecount > XFS_INODES_PER_CHUNK) + return __this_address; + + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) + return __this_address; + + return NULL; +} + +static inline int +xfs_inobt_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; + + xfs_warn(mp, + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_group->xg_gno, fa); + xfs_warn(mp, +"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", + irec->ir_startino, irec->ir_count, irec->ir_freecount, + irec->ir_free, irec->ir_holemask); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; +} + /* * Get the data from the pointed-to record. */ @@ -106,43 +162,19 @@ xfs_inobt_get_rec( { struct xfs_mount *mp = cur->bc_mp; union xfs_btree_rec *rec; + xfs_failaddr_t fa; int error; - uint64_t realfree; error = xfs_btree_get_rec(cur, &rec, stat); if (error || *stat == 0) return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) - goto out_bad_rec; - if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || - irec->ir_count > XFS_INODES_PER_CHUNK) - goto out_bad_rec; - if (irec->ir_freecount > XFS_INODES_PER_CHUNK) - goto out_bad_rec; - - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) - goto out_bad_rec; + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, irec); return 0; - -out_bad_rec: - xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno); - xfs_warn(mp, -"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", - irec->ir_startino, irec->ir_count, irec->ir_freecount, - irec->ir_free, irec->ir_holemask); - return -EFSCORRUPTED; } /* @@ -169,20 +201,22 @@ xfs_inobt_insert_rec( */ STATIC int xfs_inobt_insert( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, - xfs_btnum_t btnum) + bool is_finobt) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + if (is_finobt) + cur = xfs_finobt_init_cursor(pag, tp, agbp); + else + cur = xfs_inobt_init_cursor(pag, tp, agbp); for (thisino = newino; thisino < newino + newlen; @@ -241,8 +275,10 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!xfs_is_shutdown(cur->bc_mp)) - ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); + if (!xfs_is_shutdown(cur->bc_mp)) { + ASSERT(freecount == + to_perag(cur->bc_group)->pagi_freecount); + } } return 0; } @@ -328,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -498,36 +534,32 @@ __xfs_inobt_rec_merge( } /* - * Insert a new sparse inode chunk into the associated inode btree. The inode - * record for the sparse chunk is pre-aligned to a startino that should match - * any pre-existing sparse inode record in the tree. This allows sparse chunks - * to fill over time. + * Insert a new sparse inode chunk into the associated inode allocation btree. + * The inode record for the sparse chunk is pre-aligned to a startino that + * should match any pre-existing sparse inode record in the tree. This allows + * sparse chunks to fill over time. * - * This function supports two modes of handling preexisting records depending on - * the merge flag. If merge is true, the provided record is merged with the + * If no preexisting record exists, the provided record is inserted. + * If there is a preexisting record, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. - * If merge is false, an existing record is replaced with the provided record. - * If no preexisting record exists, the provided record is always inserted. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. */ STATIC int xfs_inobt_insert_sprec( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, - int btnum, - struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ - bool merge) /* merge or replace */ + struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -541,6 +573,7 @@ xfs_inobt_insert_sprec( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -549,45 +582,42 @@ xfs_inobt_insert_sprec( } /* - * A record exists at this startino. Merge or replace the record - * depending on what we've been asked to do. + * A record exists at this startino. Merge the records. */ - if (merge) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto error; - } - if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { - error = -EFSCORRUPTED; - goto error; - } + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - /* - * This should never fail. If we have coexisting records that - * cannot merge, something is seriously wrong. - */ - if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { - error = -EFSCORRUPTED; - goto error; - } + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(pag, &rec, nrec); - /* merge to nrec to output the updated record */ - __xfs_inobt_rec_merge(nrec, &rec); + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(pag, nrec); - error = xfs_inobt_rec_check_count(mp, nrec); - if (error) - goto error; - } + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; error = xfs_inobt_update(cur, nrec); if (error) @@ -602,6 +632,59 @@ error: } /* + * Insert a new sparse inode chunk into the free inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * The new record is always inserted, overwriting a pre-existing record if + * there is one. + */ +STATIC int +xfs_finobt_insert_sprec( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ +{ + struct xfs_mount *mp = pag_mount(pag); + struct xfs_btree_cur *cur; + int error; + int i; + + cur = xfs_finobt_init_cursor(pag, tp, agbp); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + } else { + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + + +/* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so * the caller knows it can try another AG, a hard -ENOSPC when over the maximum @@ -609,9 +692,9 @@ error: */ STATIC int xfs_ialloc_ag_alloc( + struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) + struct xfs_buf *agbp) { struct xfs_agi *agi; struct xfs_alloc_arg args; @@ -631,6 +714,7 @@ xfs_ialloc_ag_alloc( args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; args.oinfo = XFS_RMAP_OINFO_INODES; + args.pag = pag; #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -662,8 +746,6 @@ xfs_ialloc_ag_alloc( goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; /* @@ -684,7 +766,9 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_exact_bno(&args, + xfs_agbno_to_fsb(pag, args.agbno)); + if (error) return error; /* @@ -717,22 +801,17 @@ xfs_ialloc_ag_alloc( } else args.alignment = igeo->cluster_align; /* - * Need to figure out where to allocate the inode blocks. - * Ideally they should be spaced out through the a.g. - * For now, just allocate blocks up front. - */ - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - /* * Allocate a fixed-size extent of inodes. */ - args.type = XFS_ALLOCTYPE_NEAR_BNO; args.prod = 1; /* * Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -741,11 +820,11 @@ xfs_ialloc_ag_alloc( * alignment. */ if (isaligned && args.fsbno == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -757,9 +836,6 @@ xfs_ialloc_ag_alloc( igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -777,11 +853,14 @@ sparse_alloc: * the end of the AG. */ args.min_agbno = args.mp->m_sb.sb_inoalignmt; - args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.max_agbno = round_down(xfs_ag_block_count(args.mp, + pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -804,7 +883,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) @@ -831,13 +910,11 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, - XFS_BTNUM_INO, &rec, true); + error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, pag->pag_agno, - rec.ir_startino), + xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } @@ -856,21 +933,19 @@ sparse_alloc: * existing record with this one. */ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, - XFS_BTNUM_FINO, &rec, false); + error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, - XFS_BTNUM_INO); + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false); if (error) return error; if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, - newlen, XFS_BTNUM_FINO); + error = xfs_inobt_insert(pag, tp, agbp, newino, + newlen, true); if (error) return error; } @@ -923,8 +998,10 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -948,8 +1025,10 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -977,13 +1056,40 @@ xfs_inobt_first_free_inode( } /* + * If this AG has corrupt inodes, check if allocating this inode would fail + * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again + * somewhere else. + */ +static int +xfs_dialloc_check_ino( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_ino_t ino) +{ + struct xfs_imap imap; + struct xfs_buf *bp; + int error; + + error = xfs_imap(pag, tp, ino, &imap, 0); + if (error) + return -EAGAIN; + + error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); + if (error) + return -EAGAIN; + + xfs_trans_brelse(tp, bp); + return 0; +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int xfs_dialloc_ag_inobt( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -999,12 +1105,12 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - ASSERT(pag->pagi_init); - ASSERT(pag->pagi_inodeok); + ASSERT(xfs_perag_initialised_agi(pag)); + ASSERT(xfs_perag_allows_inodes(pag)); ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1019,7 +1125,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == pag->pag_agno) { + if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1027,6 +1133,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1035,6 +1142,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1193,6 +1301,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1202,6 +1311,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1211,6 +1321,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1222,7 +1333,14 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error0; + } + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1271,8 +1389,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; + } /* * See if we've landed in the parent inode record. The finobt @@ -1296,12 +1416,14 @@ xfs_dialloc_ag_finobt_near( if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } @@ -1357,8 +1479,10 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } } @@ -1369,14 +1493,18 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -1398,14 +1526,18 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); @@ -1414,8 +1546,10 @@ xfs_dialloc_ag_update_inobt( if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || - rec.ir_freecount != frec->ir_freecount)) + rec.ir_freecount != frec->ir_freecount)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return xfs_inobt_update(cur, &rec); } @@ -1429,9 +1563,9 @@ xfs_dialloc_ag_update_inobt( */ static int xfs_dialloc_ag( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -1448,7 +1582,7 @@ xfs_dialloc_ag( int i; if (!xfs_has_finobt(mp)) - return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); + return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -1457,7 +1591,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1468,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (pag->pag_agno == pagno) + if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1480,7 +1614,13 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error_cur; + } /* * Modify or remove the finobt record. @@ -1500,7 +1640,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(icur); if (error) @@ -1577,25 +1717,10 @@ xfs_dialloc_roll( return error; } -static xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - static bool xfs_dialloc_good_ag( - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, umode_t mode, int flags, bool ok_alloc) @@ -1606,11 +1731,13 @@ xfs_dialloc_good_ag( int needspace; int error; - if (!pag->pagi_inodeok) + if (!pag) + return false; + if (!xfs_perag_allows_inodes(pag)) return false; - if (!pag->pagi_init) { - error = xfs_ialloc_read_agi(pag, tp, NULL); + if (!xfs_perag_initialised_agi(pag)) { + error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } @@ -1620,7 +1747,7 @@ xfs_dialloc_good_ag( if (!ok_alloc) return false; - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; @@ -1665,8 +1792,8 @@ xfs_dialloc_good_ag( static int xfs_dialloc_try_ag( - struct xfs_trans **tpp, struct xfs_perag *pag, + struct xfs_trans **tpp, xfs_ino_t parent, xfs_ino_t *new_ino, bool ok_alloc) @@ -1679,7 +1806,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; @@ -1689,7 +1816,7 @@ xfs_dialloc_try_ag( goto out_release; } - error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + error = xfs_ialloc_ag_alloc(pag, *tpp, agbp); if (error < 0) goto out_release; @@ -1705,7 +1832,7 @@ xfs_dialloc_try_ag( } /* Allocate an inode in the found AG */ - error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino); if (!error) *new_ino = ino; return error; @@ -1716,6 +1843,40 @@ out_release: } /* + * Pick an AG for the new inode. + * + * Directories, symlinks, and regular files frequently allocate at least one + * block, so factor that potential expansion when we examine whether an AG has + * enough space for file creation. Try to keep metadata files all in the same + * AG. + */ +static inline xfs_agnumber_t +xfs_dialloc_pick_ag( + struct xfs_mount *mp, + struct xfs_inode *dp, + umode_t mode) +{ + xfs_agnumber_t start_agno; + + if (!dp) + return 0; + if (xfs_is_metadir_inode(dp)) { + if (mp->m_sb.sb_logstart) + return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); + return 0; + } + + if (S_ISDIR(mode)) + return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; + + start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + + return start_agno; +} + +/* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to @@ -1726,32 +1887,23 @@ out_release: int xfs_dialloc( struct xfs_trans **tpp, - xfs_ino_t parent, - umode_t mode, + const struct xfs_icreate_args *args, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; - xfs_agnumber_t agno; - int error = 0; - xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_ino_t ino = NULLFSINO; + xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; + xfs_agnumber_t agno; + xfs_agnumber_t start_agno; + umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; + bool low_space = false; int flags; - xfs_ino_t ino; + int error = 0; - /* - * Directories, symlinks, and regular files frequently allocate at least - * one block, so factor that potential expansion when we examine whether - * an AG has enough space for file creation. - */ - if (S_ISDIR(mode)) - start_agno = xfs_ialloc_next_ag(mp); - else { - start_agno = XFS_INO_TO_AGNO(mp, parent); - if (start_agno >= mp->m_maxagi) - start_agno = 0; - } + start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear @@ -1768,41 +1920,70 @@ xfs_dialloc( } /* + * If we are near to ENOSPC, we want to prefer allocation from AGs that + * have free inodes in them rather than use up free space allocating new + * inode chunks. Hence we turn off allocation for the first non-blocking + * pass through the AGs if we are near ENOSPC to consume free inodes + * that we can immediately allocate, but then we allow allocation on the + * second pass if we fail to find an AG with free inodes in it. + */ + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < + mp->m_low_space[XFS_LOWSP_1_PCNT]) { + ok_alloc = false; + low_space = true; + } + + /* * Loop until we find an allocation group that either has free inodes * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ - agno = start_agno; flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { - error = xfs_dialloc_try_ag(tpp, pag, parent, +retry: + for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) { + if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(pag, tpp, parent, &ino, ok_alloc); if (error != -EAGAIN) break; + error = 0; } if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } - if (++agno == mp->m_maxagi) - agno = 0; - if (agno == start_agno) { - if (!flags) { - error = -ENOSPC; - break; - } + } + if (pag) + xfs_perag_rele(pag); + if (error) + return error; + if (ino == NULLFSINO) { + if (flags) { flags = 0; + if (low_space) + ok_alloc = true; + goto retry; } - xfs_perag_put(pag); + return -ENOSPC; } - if (!error) - *new_ino = ino; - xfs_perag_put(pag); - return error; + /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if (ino == parent || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); + return -EFSCORRUPTED; + } + + *new_ino = ino; + return 0; } /* @@ -1810,10 +1991,10 @@ xfs_dialloc( * might be sparse and only free the regions that are allocated as part of the * chunk. */ -STATIC void +static int xfs_difree_inode_chunk( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; @@ -1827,10 +2008,9 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); - return; + return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), + M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1847,6 +2027,8 @@ xfs_difree_inode_chunk( XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { + int error; + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* @@ -1872,8 +2054,11 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), + contigblk, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); + if (error) + return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1881,18 +2066,19 @@ xfs_difree_inode_chunk( next: nextbit++; } + return 0; } STATIC int xfs_difree_inobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -1907,7 +2093,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1922,6 +2108,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1932,6 +2119,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1952,13 +2140,10 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - struct xfs_perag *pag = agbp->b_pag; - xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - rec.ir_startino); + xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -1981,7 +2166,9 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag, &rec); + if (error) + goto error0; } else { xic->deleted = false; @@ -2019,20 +2206,20 @@ error0: */ STATIC int xfs_difree_finobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2044,6 +2231,7 @@ xfs_difree_finobt( * something is out of sync. */ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2070,6 +2258,7 @@ xfs_difree_finobt( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2080,6 +2269,7 @@ xfs_difree_finobt( if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2096,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) @@ -2145,31 +2335,31 @@ xfs_difree( /* * Break up inode number into its components. */ - if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { - xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", - __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); + if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + if (inode != xfs_agino_to_ino(pag, agino)) { + xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { + xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", + __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2179,7 +2369,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); + error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec); if (error) goto error0; @@ -2187,7 +2377,7 @@ xfs_difree( * Fix up the free inode btree. */ if (xfs_has_finobt(mp)) { - error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); + error = xfs_difree_finobt(pag, tp, agbp, agino, &rec); if (error) goto error0; } @@ -2200,26 +2390,26 @@ error0: STATIC int xfs_imap_lookup( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, xfs_agblock_t *offset_agbno, int flags) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, pag->pag_agno); + __func__, error, pag_agno(pag)); return error; } @@ -2229,7 +2419,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2263,12 +2453,13 @@ xfs_imap_lookup( */ int xfs_imap( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ + struct xfs_perag *pag, + struct xfs_trans *tp, xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { + struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2276,18 +2467,16 @@ xfs_imap( int error; /* error code */ int offset; /* index of inode in its buffer */ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ - struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (!pag || agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || + ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* @@ -2295,28 +2484,23 @@ xfs_imap( * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - goto out_drop; - if (!pag) { - xfs_alert(mp, - "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, XFS_INO_TO_AGNO(mp, ino), - mp->m_sb.sb_agcount); - } - if (agbno >= mp->m_sb.sb_agblocks) { + return error; + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, - (unsigned long)mp->m_sb.sb_agblocks); + (unsigned long)xfs_ag_block_count(mp, + pag_agno(pag))); } - if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, - "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ - goto out_drop; + return error; } /* @@ -2327,10 +2511,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. */ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; goto out_map; } @@ -2342,12 +2526,11 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - error = 0; - goto out_drop; + return 0; } /* @@ -2359,10 +2542,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; } out_map: @@ -2373,7 +2556,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2390,14 +2573,9 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - error = -EINVAL; - goto out_drop; + return -EINVAL; } - error = 0; -out_drop: - if (pag) - xfs_perag_put(pag); - return error; + return 0; } /* @@ -2469,11 +2647,14 @@ xfs_ialloc_log_agi( static xfs_failaddr_t xfs_agi_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = bp->b_addr; - int i; + struct xfs_mount *mp = bp->b_mount; + struct xfs_agi *agi = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); + uint32_t agi_length = be32_to_cpu(agi->agi_length); + int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2490,6 +2671,10 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; + fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); + if (fa) + return fa; + if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; @@ -2499,15 +2684,6 @@ xfs_agi_verify( be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. - */ - if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return __this_address; - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; @@ -2530,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -2573,16 +2749,19 @@ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + xfs_buf_flags_t flags, struct xfs_buf **agibpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) return error; if (tp) @@ -2600,31 +2779,57 @@ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_ialloc_read_agi(pag); - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, + (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, + &agibp); if (error) return error; agi = agibp->b_addr; - if (!pag->pagi_init) { + if (!xfs_perag_initialised_agi(pag)) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); - pag->pagi_init = 1; + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } +#ifdef DEBUG /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag->pag_mount)); + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount); + ok &= pag->pagi_count == be32_to_cpu(agi->agi_count); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + xfs_trans_brelse(tp, agibp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + } +#endif /* DEBUG */ + if (agibpp) *agibpp = agibp; else @@ -2632,44 +2837,50 @@ xfs_ialloc_read_agi( return 0; } -/* Is there an inode record covering a given range of inode numbers? */ -int -xfs_ialloc_has_inode_record( - struct xfs_btree_cur *cur, - xfs_agino_t low, - xfs_agino_t high, - bool *exists) +/* How many inodes are backed by inode clusters ondisk? */ +STATIC int +xfs_ialloc_count_ondisk( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + unsigned int *allocated) { struct xfs_inobt_rec_incore irec; - xfs_agino_t agino; - uint16_t holemask; - int has_record; - int i; - int error; + unsigned int ret = 0; + int has_record; + int error; - *exists = false; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); - while (error == 0 && has_record) { + if (error) + return error; + + while (has_record) { + unsigned int i, hole_idx; + error = xfs_inobt_get_rec(cur, &irec, &has_record); - if (error || irec.ir_startino > high) + if (error) + return error; + if (irec.ir_startino > high) break; - agino = irec.ir_startino; - holemask = irec.ir_holemask; - for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, - i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { - if (holemask & 1) + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (irec.ir_startino + i < low) continue; - if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && - agino <= high) { - *exists = true; - return 0; - } + if (irec.ir_startino + i > high) + break; + + hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; + if (!(irec.ir_holemask & (1U << hole_idx))) + ret++; } error = xfs_btree_increment(cur, 0, &has_record); + if (error) + return error; } - return error; + + *allocated = ret; + return 0; } /* Is there an inode record covering a given extent? */ @@ -2678,15 +2889,27 @@ xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { - xfs_agino_t low; - xfs_agino_t high; + xfs_agino_t agino; + xfs_agino_t last_agino; + unsigned int allocated; + int error; + + agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; - low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); - high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; + error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); + if (error) + return error; - return xfs_ialloc_has_inode_record(cur, low, high, exists); + if (allocated == 0) + *outcome = XBTREE_RECPACKING_EMPTY; + else if (allocated == last_agino - agino + 1) + *outcome = XBTREE_RECPACKING_FULL; + else + *outcome = XBTREE_RECPACKING_SPARSE; + return 0; } struct xfs_ialloc_count_inodes { @@ -2703,8 +2926,13 @@ xfs_ialloc_count_inodes_rec( { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; + xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, &irec); + ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; @@ -2721,7 +2949,7 @@ xfs_ialloc_count_inodes( struct xfs_ialloc_count_inodes ci = {0}; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if (error) return error; @@ -2762,8 +2990,8 @@ xfs_ialloc_setup_geometry( /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true); + igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false); igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; @@ -2848,6 +3076,11 @@ xfs_ialloc_setup_geometry( igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; + + if (mp->m_sb.sb_blocksize > PAGE_SIZE) + igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; + else + igeo->min_folio_order = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ @@ -2924,26 +3157,24 @@ xfs_ialloc_calc_rootino( */ int xfs_ialloc_check_shrink( + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf *agibp, xfs_agblock_t new_length) { struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; - xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length); + xfs_agino_t agino; int has; int error; - if (!xfs_has_sparseinodes(mp)) + if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; - pag = xfs_perag_get(mp, agno); - cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. */ + agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; @@ -2953,6 +3184,7 @@ xfs_ialloc_check_shrink( goto out; if (!has) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT); error = -EFSCORRUPTED; goto out; } @@ -2964,6 +3196,5 @@ xfs_ialloc_check_shrink( } out: xfs_btree_del_cursor(cur, error); - xfs_perag_put(pag); return error; } |
