diff options
Diffstat (limited to 'fs/xfs')
161 files changed, 10450 insertions, 10103 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 06b68b6115bc..4f95df476181 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -26,8 +26,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ + xfs_btree_staging.o \ xfs_da_btree.o \ - xfs_da_format.o \ xfs_defer.o \ xfs_dir2.o \ xfs_dir2_block.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index da031b93e182..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -32,7 +32,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) /* - * __vmalloc() will allocate data pages and auxillary structures (e.g. + * __vmalloc() will allocate data pages and auxiliary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence * we need to tell memory reclaim that we are in such a context via * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8170d95cf930..6143117770e9 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -78,39 +78,9 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * Zone interfaces */ -#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN -#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT -#define KM_ZONE_SPREAD SLAB_MEM_SPREAD -#define KM_ZONE_ACCOUNT SLAB_ACCOUNT - #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache -static inline kmem_zone_t * -kmem_zone_init(int size, char *zone_name) -{ - return kmem_cache_create(zone_name, size, 0, 0, NULL); -} - -static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, - void (*construct)(void *)) -{ - return kmem_cache_create(zone_name, size, 0, flags, construct); -} - -static inline void -kmem_zone_free(kmem_zone_t *zone, void 
*ptr) -{ - kmem_cache_free(zone, ptr); -} - -static inline void -kmem_zone_destroy(kmem_zone_t *zone) -{ - kmem_cache_destroy(zone); -} - extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); static inline void * diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 14fbdf22b7e7..9d84007a5c65 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -23,25 +23,28 @@ #include "xfs_ag_resv.h" #include "xfs_health.h" -static struct xfs_buf * +static int xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0); - if (!bp) - return NULL; + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; - return bp; + *bpp = bp; + return 0; } static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) @@ -228,7 +231,7 @@ xfs_sbblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; @@ -240,7 +243,7 @@ xfs_agfblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -298,7 +301,7 @@ xfs_agflblock_init( uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } @@ -309,7 +312,7 @@ xfs_agiblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = 
cpu_to_be32(XFS_AGI_MAGIC); @@ -340,13 +343,13 @@ xfs_ag_init_hdr( struct aghdr_init_data *id, aghdr_init_work_f work, const struct xfs_buf_ops *ops) - { struct xfs_buf *bp; + int error; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops); - if (!bp) - return -ENOMEM; + error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops); + if (error) + return error; (*work)(mp, bp, id); @@ -499,7 +502,7 @@ xfs_ag_extend_space( if (error) return error; - agi = XFS_BUF_TO_AGI(bp); + agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); @@ -512,7 +515,7 @@ xfs_ag_extend_space( if (error) return error; - agf = XFS_BUF_TO_AGF(bp); + agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); @@ -566,11 +569,11 @@ xfs_ag_get_geometry( memset(ageo, 0, sizeof(*ageo)); ageo->ag_number = agno; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 87a9747f1d36..fdfe6dc0d307 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -19,6 +19,8 @@ #include "xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" /* * Per-AG Block Reservations diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 533b04aaf6f6..203e74fa64aa 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -146,9 +146,13 @@ xfs_alloc_lookup_eq( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; + cur->bc_rec.a.ar_startblock = bno; 
cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); + cur->bc_ag.abt.active = (*stat == 1); + return error; } /* @@ -162,9 +166,13 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; + cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + cur->bc_ag.abt.active = (*stat == 1); + return error; } /* @@ -178,9 +186,19 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { + int error; cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + cur->bc_ag.abt.active = (*stat == 1); + return error; +} + +static inline bool +xfs_alloc_cur_active( + struct xfs_btree_cur *cur) +{ + return cur && cur->bc_ag.abt.active; } /* @@ -212,7 +230,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -313,7 +331,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ - bool userdata = xfs_alloc_is_userdata(datatype); + bool userdata = datatype & XFS_ALLOC_USERDATA; ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -433,13 +451,17 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) + return -EFSCORRUPTED; #endif } else { if ((error = 
xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } /* * Look up the record in the by-block tree if necessary. @@ -448,13 +470,17 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, - i == 1 && nfbno1 == fbno && nflen1 == flen); + if (XFS_IS_CORRUPT(mp, + i != 1 || + nfbno1 != fbno || + nflen1 != flen)) + return -EFSCORRUPTED; #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } #ifdef DEBUG @@ -465,8 +491,10 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN(mp, - bnoblock->bb_numrecs == cntblock->bb_numrecs); + if (XFS_IS_CORRUPT(mp, + bnoblock->bb_numrecs != + cntblock->bb_numrecs)) + return -EFSCORRUPTED; } #endif @@ -496,25 +524,30 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Add new by-size btree entry(s). 
*/ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } /* * Fix up the by-block btree entry(s). @@ -525,7 +558,8 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else { /* * Update the by-block entry to start later|be shorter. 
@@ -539,10 +573,12 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 0); + if (XFS_IS_CORRUPT(mp, i != 0)) + return -EFSCORRUPTED; if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } return 0; } @@ -553,6 +589,7 @@ xfs_agfl_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; /* @@ -578,8 +615,8 @@ xfs_agfl_verify( return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && - be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; } @@ -677,23 +714,305 @@ xfs_alloc_update_counters( struct xfs_buf *agbp, long len) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > - be32_to_cpu(agf->agf_length))) + be32_to_cpu(agf->agf_length))) { + xfs_buf_mark_corrupt(agbp); return -EFSCORRUPTED; + } xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); return 0; } /* - * Allocation group level functions. + * Block allocation algorithm and data structures. */ +struct xfs_alloc_cur { + struct xfs_btree_cur *cnt; /* btree cursors */ + struct xfs_btree_cur *bnolt; + struct xfs_btree_cur *bnogt; + xfs_extlen_t cur_len;/* current search length */ + xfs_agblock_t rec_bno;/* extent startblock */ + xfs_extlen_t rec_len;/* extent length */ + xfs_agblock_t bno; /* alloc bno */ + xfs_extlen_t len; /* alloc len */ + xfs_extlen_t diff; /* diff from search bno */ + unsigned int busy_gen;/* busy state */ + bool busy; +}; + +/* + * Set up cursors, etc. 
in the extent allocation cursor. This function can be + * called multiple times to reset an initialized structure without having to + * reallocate cursors. + */ +static int +xfs_alloc_cur_setup( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + int error; + int i; + + ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); + + acur->cur_len = args->maxlen; + acur->rec_bno = 0; + acur->rec_len = 0; + acur->bno = 0; + acur->len = 0; + acur->diff = -1; + acur->busy = false; + acur->busy_gen = 0; + + /* + * Perform an initial cntbt lookup to check for availability of maxlen + * extents. If this fails, we'll return -ENOSPC to signal the caller to + * attempt a small allocation. + */ + if (!acur->cnt) + acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_CNT); + error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); + if (error) + return error; + + /* + * Allocate the bnobt left and right search cursors. + */ + if (!acur->bnolt) + acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + if (!acur->bnogt) + acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + return i == 1 ? 0 : -ENOSPC; +} + +static void +xfs_alloc_cur_close( + struct xfs_alloc_cur *acur, + bool error) +{ + int cur_error = XFS_BTREE_NOERROR; + + if (error) + cur_error = XFS_BTREE_ERROR; + + if (acur->cnt) + xfs_btree_del_cursor(acur->cnt, cur_error); + if (acur->bnolt) + xfs_btree_del_cursor(acur->bnolt, cur_error); + if (acur->bnogt) + xfs_btree_del_cursor(acur->bnogt, cur_error); + acur->cnt = acur->bnolt = acur->bnogt = NULL; +} + +/* + * Check an extent for allocation and track the best available candidate in the + * allocation structure. The cursor is deactivated if it has entered an out of + * range state based on allocation arguments. Optionally return the extent + * extent geometry and allocation status if requested by the caller. 
+ */ +static int +xfs_alloc_cur_check( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + int *new) +{ + int error, i; + xfs_agblock_t bno, bnoa, bnew; + xfs_extlen_t len, lena, diff = -1; + bool busy; + unsigned busy_gen = 0; + bool deactivate = false; + bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; + + *new = 0; + + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) + return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) + return -EFSCORRUPTED; + + /* + * Check minlen and deactivate a cntbt cursor if out of acceptable size + * range (i.e., walking backwards looking for a minlen extent). + */ + if (len < args->minlen) { + deactivate = !isbnobt; + goto out; + } + + busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, + &busy_gen); + acur->busy |= busy; + if (busy) + acur->busy_gen = busy_gen; + /* deactivate a bnobt cursor outside of locality range */ + if (bnoa < args->min_agbno || bnoa > args->max_agbno) { + deactivate = isbnobt; + goto out; + } + if (lena < args->minlen) + goto out; + + args->len = XFS_EXTLEN_MIN(lena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < acur->len) + goto out; + + /* + * We have an aligned record that satisfies minlen and beats or matches + * the candidate extent size. Compare locality for near allocation mode. + */ + ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); + diff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->datatype, + bnoa, lena, &bnew); + if (bnew == NULLAGBLOCK) + goto out; + + /* + * Deactivate a bnobt cursor with worse locality than the current best. + */ + if (diff > acur->diff) { + deactivate = isbnobt; + goto out; + } + + ASSERT(args->len > acur->len || + (args->len == acur->len && diff <= acur->diff)); + acur->rec_bno = bno; + acur->rec_len = len; + acur->bno = bnew; + acur->len = args->len; + acur->diff = diff; + *new = 1; + + /* + * We're done if we found a perfect allocation. 
This only deactivates + * the current cursor, but this is just an optimization to terminate a + * cntbt search that otherwise runs to the edge of the tree. + */ + if (acur->diff == 0 && acur->len == args->maxlen) + deactivate = true; +out: + if (deactivate) + cur->bc_ag.abt.active = false; + trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, + *new); + return 0; +} + +/* + * Complete an allocation of a candidate extent. Remove the extent from both + * trees and update the args structure. + */ +STATIC int +xfs_alloc_cur_finish( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; + int error; + + ASSERT(acur->cnt && acur->bnolt); + ASSERT(acur->bno >= acur->rec_bno); + ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); + ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + + error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, + acur->rec_len, acur->bno, acur->len, 0); + if (error) + return error; + + args->agbno = acur->bno; + args->len = acur->len; + args->wasfromfl = 0; + + trace_xfs_alloc_cur(args); + return 0; +} + +/* + * Locality allocation lookup algorithm. This expects a cntbt cursor and uses + * bno optimized lookup to search for extents with ideal size and locality. 
+ */ +STATIC int +xfs_alloc_cntbt_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur) +{ + struct xfs_btree_cur *cur = acur->cnt; + xfs_agblock_t bno; + xfs_extlen_t len, cur_len; + int error; + int i; + + if (!xfs_alloc_cur_active(cur)) + return 0; + + /* locality optimized lookup */ + cur_len = acur->cur_len; + error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); + if (error) + return error; + if (i == 0) + return 0; + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (error) + return error; + + /* check the current record and update search length from it */ + error = xfs_alloc_cur_check(args, acur, cur, &i); + if (error) + return error; + ASSERT(len >= acur->cur_len); + acur->cur_len = len; + + /* + * We looked up the first record >= [agbno, len] above. The agbno is a + * secondary key and so the current record may lie just before or after + * agbno. If it is past agbno, check the previous record too so long as + * the length matches as it may be closer. Don't check a smaller record + * because that could deactivate our cursor. + */ + if (bno > args->agbno) { + error = xfs_btree_decrement(cur, 0, &i); + if (!error && i) { + error = xfs_alloc_get_rec(cur, &bno, &len, &i); + if (!error && i && len == acur->cur_len) + error = xfs_alloc_cur_check(args, acur, cur, + &i); + } + if (error) + return error; + } + + /* + * Increment the search key until we find at least one allocation + * candidate or if the extent we found was larger. Otherwise, double the + * search key to optimize the search. Efficiency is more important here + * than absolute best locality. + */ + cur_len <<= 1; + if (!acur->len || acur->cur_len >= cur_len) + acur->cur_len++; + else + acur->cur_len = cur_len; + + return error; +} /* * Deal with the case where only small freespaces remain. 
Either return the @@ -708,6 +1027,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_agf *agf = args->agbp->b_addr; int error = 0; xfs_agblock_t fbno = NULLAGBLOCK; xfs_extlen_t flen = 0; @@ -727,14 +1047,16 @@ xfs_alloc_ag_vextent_small( error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } goto out; } if (args->minlen != 1 || args->alignment != 1 || args->resv == XFS_AG_RESV_AGFL || - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= - args->minleft)) + be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -744,23 +1066,24 @@ xfs_alloc_ag_vextent_small( goto out; xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - xfs_alloc_allow_busy_reuse(args->datatype)); + (args->datatype & XFS_ALLOC_NOBUSY)); - if (xfs_alloc_is_userdata(args->datatype)) { + if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; - bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno); - if (!bp) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, + XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + args->mp->m_bsize, 0, &bp); + if (error) goto error; - } xfs_trans_binval(args->tp, bp); } *fbnop = args->agbno = fbno; *flenp = args->len = 1; - XFS_WANT_CORRUPTED_GOTO(args->mp, - fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error); + if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { + error = -EFSCORRUPTED; + goto error; + } args->wasfromfl = 1; trace_xfs_alloc_small_freelist(args); @@ -879,6 +1202,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; xfs_btree_cur_t 
*bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ int error; @@ -915,7 +1239,10 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } ASSERT(fbno <= args->agbno); /* @@ -954,8 +1281,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -984,98 +1310,243 @@ error0: } /* - * Search the btree in a given direction via the search cursor and compare - * the records found against the good extent we've already found. + * Search a given number of btree records in a given direction. Check each + * record against the good extent we've already found. 
*/ STATIC int -xfs_alloc_find_best_extent( - struct xfs_alloc_arg *args, /* allocation argument structure */ - struct xfs_btree_cur **gcur, /* good cursor */ - struct xfs_btree_cur **scur, /* searching cursor */ - xfs_agblock_t gdiff, /* difference for search comparison */ - xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, /* extent length */ - xfs_agblock_t *sbnoa, /* aligned extent found by search */ - xfs_extlen_t *slena, /* aligned extent length */ - int dir) /* 0 = search right, 1 = search left */ +xfs_alloc_walk_iter( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + struct xfs_btree_cur *cur, + bool increment, + bool find_one, /* quit on first candidate */ + int count, /* rec count (-1 for infinite) */ + int *stat) { - xfs_agblock_t new; - xfs_agblock_t sdiff; int error; int i; - unsigned busy_gen; - /* The good extent is perfect, no need to search. */ - if (!gdiff) - goto out_use_good; + *stat = 0; /* - * Look until we find a better one, run out of space or run off the end. + * Search so long as the cursor is active or we find a better extent. + * The cursor is deactivated if it extends beyond the range of the + * current allocation candidate. 
*/ - do { - error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + while (xfs_alloc_cur_active(cur) && count) { + error = xfs_alloc_cur_check(args, acur, cur, &i); if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, - sbnoa, slena, &busy_gen); + return error; + if (i == 1) { + *stat = 1; + if (find_one) + break; + } + if (!xfs_alloc_cur_active(cur)) + break; + + if (increment) + error = xfs_btree_increment(cur, 0, &i); + else + error = xfs_btree_decrement(cur, 0, &i); + if (error) + return error; + if (i == 0) + cur->bc_ag.abt.active = false; + + if (count > 0) + count--; + } + + return 0; +} + +/* + * Search the by-bno and by-size btrees in parallel in search of an extent with + * ideal locality based on the NEAR mode ->agbno locality hint. + */ +STATIC int +xfs_alloc_ag_vextent_locality( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + int *stat) +{ + struct xfs_btree_cur *fbcur = NULL; + int error; + int i; + bool fbinc; + + ASSERT(acur->len == 0); + ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); + + *stat = 0; + + error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); + if (error) + return error; + error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); + if (error) + return error; + error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); + if (error) + return error; + + /* + * Search the bnobt and cntbt in parallel. Search the bnobt left and + * right and lookup the closest extent to the locality hint for each + * extent size key in the cntbt. The entire search terminates + * immediately on a bnobt hit because that means we've found best case + * locality. Otherwise the search continues until the cntbt cursor runs + * off the end of the tree. If no allocation candidate is found at this + * point, give up on locality, walk backwards from the end of the cntbt + * and take the first available extent. 
+ * + * The parallel tree searches balance each other out to provide fairly + * consistent performance for various situations. The bnobt search can + * have pathological behavior in the worst case scenario of larger + * allocation requests and fragmented free space. On the other hand, the + * bnobt is able to satisfy most smaller allocation requests much more + * quickly than the cntbt. The cntbt search can sift through fragmented + * free space and sets of free extents for larger allocation requests + * more quickly than the bnobt. Since the locality hint is just a hint + * and we don't want to scan the entire bnobt for perfect locality, the + * cntbt search essentially bounds the bnobt search such that we can + * find good enough locality at reasonable performance in most cases. + */ + while (xfs_alloc_cur_active(acur->bnolt) || + xfs_alloc_cur_active(acur->bnogt) || + xfs_alloc_cur_active(acur->cnt)) { + + trace_xfs_alloc_cur_lookup(args); /* - * The good extent is closer than this one. + * Search the bnobt left and right. In the case of a hit, finish + * the search in the opposite direction and we're done. */ - if (!dir) { - if (*sbnoa > args->max_agbno) - goto out_use_good; - if (*sbnoa >= args->agbno + gdiff) - goto out_use_good; - } else { - if (*sbnoa < args->min_agbno) - goto out_use_good; - if (*sbnoa <= args->agbno - gdiff) - goto out_use_good; + error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, + true, 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_left(args); + fbcur = acur->bnogt; + fbinc = true; + break; + } + error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, + 1, &i); + if (error) + return error; + if (i == 1) { + trace_xfs_alloc_cur_right(args); + fbcur = acur->bnolt; + fbinc = false; + break; } /* - * Same distance, compare length and pick the best. + * Check the extent with best locality based on the current + * extent size search key and keep track of the best candidate. 
*/ - if (*slena >= args->minlen) { - args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); - xfs_alloc_fix_len(args); - - sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, - args->datatype, *sbnoa, - *slena, &new); + error = xfs_alloc_cntbt_iter(args, acur); + if (error) + return error; + if (!xfs_alloc_cur_active(acur->cnt)) { + trace_xfs_alloc_cur_lookup_done(args); + break; + } + } - /* - * Choose closer size and invalidate other cursor. - */ - if (sdiff < gdiff) - goto out_use_search; - goto out_use_good; + /* + * If we failed to find anything due to busy extents, return empty + * handed so the caller can flush and retry. If no busy extents were + * found, walk backwards from the end of the cntbt as a last resort. + */ + if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { + error = xfs_btree_decrement(acur->cnt, 0, &i); + if (error) + return error; + if (i) { + acur->cnt->bc_ag.abt.active = true; + fbcur = acur->cnt; + fbinc = false; } + } - if (!dir) - error = xfs_btree_increment(*scur, 0, &i); - else - error = xfs_btree_decrement(*scur, 0, &i); + /* + * Search in the opposite direction for a better entry in the case of + * a bnobt hit or walk backwards from the end of the cntbt. + */ + if (fbcur) { + error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, + &i); if (error) - goto error0; - } while (i); + return error; + } -out_use_good: - xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); - *scur = NULL; - return 0; + if (acur->len) + *stat = 1; -out_use_search: - xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); - *gcur = NULL; return 0; +} -error0: - /* caller invalidates cursors */ - return error; +/* Check the last block of the cnt btree for allocations. */ +static int +xfs_alloc_ag_vextent_lastblock( + struct xfs_alloc_arg *args, + struct xfs_alloc_cur *acur, + xfs_agblock_t *bno, + xfs_extlen_t *len, + bool *allocated) +{ + int error; + int i; + +#ifdef DEBUG + /* Randomly don't execute the first algorithm. 
*/ + if (prandom_u32() & 1) + return 0; +#endif + + /* + * Start from the entry that lookup found, sequence through all larger + * free blocks. If we're actually pointing at a record smaller than + * maxlen, go to the start of this block, and skip all those smaller + * than minlen. + */ + if (*len || args->alignment > 1) { + acur->cnt->bc_ptrs[0] = 1; + do { + error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); + if (error) + return error; + if (XFS_IS_CORRUPT(args->mp, i != 1)) + return -EFSCORRUPTED; + if (*len >= args->minlen) + break; + error = xfs_btree_increment(acur->cnt, 0, &i); + if (error) + return error; + } while (i); + ASSERT(*len >= args->minlen); + if (!i) + return 0; + } + + error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); + if (error) + return error; + + /* + * It didn't work. We COULD be in a case where there's a good record + * somewhere, so try again. + */ + if (acur->len == 0) + return 0; + + trace_xfs_alloc_near_first(args); + *allocated = true; + return 0; } /* @@ -1084,41 +1555,17 @@ error0: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. */ -STATIC int /* error */ +STATIC int xfs_alloc_ag_vextent_near( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args) { - xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ - xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ - xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ - xfs_agblock_t gtbno; /* start bno of right side entry */ - xfs_agblock_t gtbnoa; /* aligned ... */ - xfs_extlen_t gtdiff; /* difference to right side entry */ - xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena; /* aligned ... 
*/ - xfs_agblock_t gtnew; /* useful start bno of right side */ - int error; /* error code */ - int i; /* result code, temporary */ - int j; /* result code, temporary */ - xfs_agblock_t ltbno; /* start bno of left side entry */ - xfs_agblock_t ltbnoa; /* aligned ... */ - xfs_extlen_t ltdiff; /* difference to left side entry */ - xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena; /* aligned ... */ - xfs_agblock_t ltnew; /* useful start bno of left side */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; -#ifdef DEBUG - /* - * Randomly don't execute the first algorithm. - */ - int dofirst; /* set to do first algorithm */ - - dofirst = prandom_u32() & 1; -#endif + struct xfs_alloc_cur acur = {}; + int error; /* error code */ + int i; /* result code, temporary */ + xfs_agblock_t bno; + xfs_extlen_t len; - /* handle unitialized agbno range so caller doesn't have to */ + /* handle uninitialized agbno range so caller doesn't have to */ if (!args->min_agbno && !args->max_agbno) args->max_agbno = args->mp->m_sb.sb_agblocks - 1; ASSERT(args->min_agbno <= args->max_agbno); @@ -1130,40 +1577,27 @@ xfs_alloc_ag_vextent_near( args->agbno = args->max_agbno; restart: - bno_cur_lt = NULL; - bno_cur_gt = NULL; - ltlen = 0; - gtlena = 0; - ltlena = 0; - busy = false; + len = 0; /* - * Get a cursor for the by-size btree. - */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); - - /* - * See if there are any free extents as big as maxlen. - */ - if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) - goto error0; - /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. + * Set up cursors and see if there are any free extents as big as + * maxlen. If not, pick the last entry in the tree unless the tree is + * empty. 
*/ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno, - &ltlen, &i))) - goto error0; - if (i == 0 || ltlen == 0) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + error = xfs_alloc_cur_setup(args, &acur); + if (error == -ENOSPC) { + error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, + &len, &i); + if (error) + goto out; + if (i == 0 || len == 0) { trace_xfs_alloc_near_noentry(args); - return 0; + goto out; } ASSERT(i == 1); + } else if (error) { + goto out; } - args->wasfromfl = 0; /* * First algorithm. @@ -1172,311 +1606,47 @@ restart: * near the right edge of the tree. If it's in the last btree leaf * block, then we just examine all the entries in that block * that are big enough, and pick the best one. - * This is written as a while loop so we can break out of it, - * but we never loop back to the top. */ - while (xfs_btree_islastblock(cnt_cur, 0)) { - xfs_extlen_t bdiff; - int besti=0; - xfs_extlen_t blen=0; - xfs_agblock_t bnew=0; + if (xfs_btree_islastblock(acur.cnt, 0)) { + bool allocated = false; -#ifdef DEBUG - if (dofirst) - break; -#endif - /* - * Start from the entry that lookup found, sequence through - * all larger free blocks. If we're actually pointing at a - * record smaller than maxlen, go to the start of this block, - * and skip all those smaller than minlen. - */ - if (ltlen || args->alignment > 1) { - cnt_cur->bc_ptrs[0] = 1; - do { - if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, - &ltlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - if (ltlen >= args->minlen) - break; - if ((error = xfs_btree_increment(cnt_cur, 0, &i))) - goto error0; - } while (i); - ASSERT(ltlen >= args->minlen); - if (!i) - break; - } - i = cnt_cur->bc_ptrs[0]; - for (j = 1, blen = 0, bdiff = 0; - !error && j && (blen < args->maxlen || bdiff > 0); - error = xfs_btree_increment(cnt_cur, 0, &j)) { - /* - * For each entry, decide if it's better than - * the previous best entry. 
- */ - if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, - &ltbnoa, &ltlena, &busy_gen); - if (ltlena < args->minlen) - continue; - if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) - continue; - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - ASSERT(args->len >= args->minlen); - if (args->len < blen) - continue; - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, ltbnoa, - ltlena, &ltnew); - if (ltnew != NULLAGBLOCK && - (args->len > blen || ltdiff < bdiff)) { - bdiff = ltdiff; - bnew = ltnew; - blen = args->len; - besti = cnt_cur->bc_ptrs[0]; - } - } - /* - * It didn't work. We COULD be in a case where - * there's a good record somewhere, so try again. - */ - if (blen == 0) - break; - /* - * Point at the best entry, and retrieve it again. - */ - cnt_cur->bc_ptrs[0] = besti; - if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - args->len = blen; - - /* - * We are allocating starting at bnew for blen blocks. - */ - args->agbno = bnew; - ASSERT(bnew >= ltbno); - ASSERT(bnew + blen <= ltbno + ltlen); - /* - * Set up a cursor for the by-bno tree. - */ - bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); - /* - * Fix up the btree entries. - */ - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, - ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) - goto error0; - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - - trace_xfs_alloc_near_first(args); - return 0; - } - /* - * Second algorithm. 
- * Search in the by-bno tree to the left and to the right - * simultaneously, until in each case we find a space big enough, - * or run into the edge of the tree. When we run into the edge, - * we deallocate that cursor. - * If both searches succeed, we compare the two spaces and pick - * the better one. - * With alignment, it's possible for both to fail; the upper - * level algorithm that picks allocation groups for allocations - * is not supposed to do this. - */ - /* - * Allocate and initialize the cursor for the leftward search. - */ - bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); - /* - * Lookup <= bno to find the leftward search's starting point. - */ - if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) - goto error0; - if (!i) { - /* - * Didn't find anything; use this cursor for the rightward - * search. - */ - bno_cur_gt = bno_cur_lt; - bno_cur_lt = NULL; - } - /* - * Found something. Duplicate the cursor for the rightward search. - */ - else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) - goto error0; - /* - * Increment the cursor, so we will point at the entry just right - * of the leftward entry if any, or to the leftmost entry. - */ - if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - /* - * It failed, there are no rightward entries. - */ - xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); - bno_cur_gt = NULL; + error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, + &allocated); + if (error) + goto out; + if (allocated) + goto alloc_finish; } - /* - * Loop going left with the leftward cursor, right with the - * rightward cursor, until either both directions give up or - * we find an entry at least as big as minlen. 
- */ - do { - if (bno_cur_lt) { - if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, - &ltbnoa, &ltlena, &busy_gen); - if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) - break; - if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) - goto error0; - if (!i || ltbnoa < args->min_agbno) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - } - if (bno_cur_gt) { - if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, - &gtbnoa, &gtlena, &busy_gen); - if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) - break; - if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) - goto error0; - if (!i || gtbnoa > args->max_agbno) { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - } while (bno_cur_lt || bno_cur_gt); /* - * Got both cursors still active, need to find better entry. + * Second algorithm. Combined cntbt and bnobt search to find ideal + * locality. */ - if (bno_cur_lt && bno_cur_gt) { - if (ltlena >= args->minlen) { - /* - * Left side is good, look for a right side entry. - */ - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, ltbnoa, - ltlena, &ltnew); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_lt, &bno_cur_gt, - ltdiff, &gtbno, &gtlen, - &gtbnoa, &gtlena, - 0 /* search right */); - } else { - ASSERT(gtlena >= args->minlen); - - /* - * Right side is good, look for a left side entry. 
- */ - args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); - xfs_alloc_fix_len(args); - gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->datatype, gtbnoa, - gtlena, &gtnew); - - error = xfs_alloc_find_best_extent(args, - &bno_cur_gt, &bno_cur_lt, - gtdiff, &ltbno, &ltlen, - &ltbnoa, &ltlena, - 1 /* search left */); - } - - if (error) - goto error0; - } + error = xfs_alloc_ag_vextent_locality(args, &acur, &i); + if (error) + goto out; /* * If we couldn't get anything, give up. */ - if (bno_cur_lt == NULL && bno_cur_gt == NULL) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - - if (busy) { + if (!acur.len) { + if (acur.busy) { trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + xfs_extent_busy_flush(args->mp, args->pag, + acur.busy_gen); goto restart; } trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; - return 0; + goto out; } - /* - * At this point we have selected a freespace entry, either to the - * left or to the right. If it's on the right, copy all the - * useful variables to the "left" set so we only have one - * copy of this code. - */ - if (bno_cur_gt) { - bno_cur_lt = bno_cur_gt; - bno_cur_gt = NULL; - ltbno = gtbno; - ltbnoa = gtbnoa; - ltlen = gtlen; - ltlena = gtlena; - j = 1; - } else - j = 0; +alloc_finish: + /* fix up btrees on a successful allocation */ + error = xfs_alloc_cur_finish(args, &acur); - /* - * Fix up the length and compute the useful address. 
- */ - args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - args->datatype, ltbnoa, ltlena, &ltnew); - ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltbnoa + ltlena); - ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); - args->agbno = ltnew; - - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, - ltnew, rlen, XFSA_FIXUP_BNO_OK))) - goto error0; - - if (j) - trace_xfs_alloc_near_greater(args); - else - trace_xfs_alloc_near_lesser(args); - - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - return 0; - - error0: - trace_xfs_alloc_near_error(args); - if (cnt_cur != NULL) - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); - if (bno_cur_lt != NULL) - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); - if (bno_cur_gt != NULL) - xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); +out: + xfs_alloc_cur_close(&acur, error); return error; } @@ -1490,6 +1660,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf *agf = args->agbp->b_addr; xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ int error; /* error result */ @@ -1545,7 +1716,10 @@ restart: error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1579,8 +1753,13 @@ restart: * This can't happen in the second case above. 
*/ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + error = -EFSCORRUPTED; + goto error0; + } if (rlen < args->maxlen) { xfs_agblock_t bestfbno; xfs_extlen_t bestflen; @@ -1599,15 +1778,22 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (flen < bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || - (rlen <= flen && rbno + rlen <= fbno + flen), - error0); + if (XFS_IS_CORRUPT(args->mp, + rlen != 0 && + (rlen > flen || + rbno + rlen > fbno + flen))) { + error = -EFSCORRUPTED; + goto error0; + } if (rlen > bestrlen) { bestrlen = rlen; bestrbno = rbno; @@ -1620,7 +1806,10 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1643,7 +1832,10 @@ restart: xfs_alloc_fix_len(args); rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); + if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Allocate and initialize a cursor for the by-block tree. 
*/ @@ -1657,10 +1849,12 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO(args->mp, - args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error0); + if (XFS_IS_CORRUPT(args->mp, + args->agbno + args->len > + be32_to_cpu(agf->agf_length))) { + error = -EFSCORRUPTED; + goto error0; + } trace_xfs_alloc_size_done(args); return 0; @@ -1732,7 +1926,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * It's not contiguous, though. */ @@ -1744,8 +1941,10 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(mp, - ltbno + ltlen <= bno, error0); + if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { + error = -EFSCORRUPTED; + goto error0; + } } } /* @@ -1760,7 +1959,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * It's not contiguous, though. */ @@ -1772,7 +1974,10 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); + if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { + error = -EFSCORRUPTED; + goto error0; + } } } /* @@ -1789,31 +1994,49 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Delete the old by-size entry on the right. 
*/ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Move the by-block cursor back to the left neighbor. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1826,9 +2049,13 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, - i == 1 && xxbno == ltbno && xxlen == ltlen, - error0); + if (XFS_IS_CORRUPT(mp, + i != 1 || + xxbno != ltbno || + xxlen != ltlen)) { + error = -EFSCORRUPTED; + goto error0; + } } #endif /* @@ -1849,17 +2076,26 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Back up the by-block cursor to the left neighbor, and * update its length. 
*/ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1875,10 +2111,16 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Update the starting block and length of the right * neighbor in the by-block tree. @@ -1897,7 +2139,10 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; @@ -1906,10 +2151,16 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; @@ -1989,30 +2240,39 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return pag->pagf_longest - delta; + return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. 
*/ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -2086,9 +2346,11 @@ xfs_free_agfl_block( if (error) return error; - bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno); - if (!bp) - return -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, + XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), + tp->t_mountp->m_bsize, 0, &bp); + if (error) + return error; xfs_trans_binval(tp, bp); return 0; @@ -2161,7 +2423,7 @@ xfs_agfl_reset( struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; ASSERT(pag->pagf_agflreset); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); @@ -2239,12 +2501,11 @@ xfs_alloc_fix_freelist( if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) + if (error) { + /* Couldn't 
lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; - if (!pag->pagf_init) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - goto out_agbp_relse; } } @@ -2253,7 +2514,7 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && + if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2270,11 +2531,10 @@ xfs_alloc_fix_freelist( */ if (!agbp) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); - if (error) - goto out_no_agbp; - if (!agbp) { - ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + if (error) { + /* Couldn't lock the AGF so skip this AG. */ + if (error == -EAGAIN) + error = 0; goto out_no_agbp; } } @@ -2394,7 +2654,7 @@ xfs_alloc_get_freelist( xfs_agblock_t *bnop, /* block address retrieved from freelist */ int btreeblk) /* destination is a AGF btree */ { - xfs_agf_t *agf; /* a.g. freespace structure */ + struct xfs_agf *agf = agbp->b_addr; xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ __be32 *agfl_bno; @@ -2406,7 +2666,6 @@ xfs_alloc_get_freelist( /* * Freelist is empty, give up. */ - agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2423,7 +2682,7 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. 
*/ - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); @@ -2484,7 +2743,7 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; - trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); @@ -2505,11 +2764,10 @@ xfs_alloc_pagf_init( xfs_buf_t *bp; int error; - if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) - return error; - if (bp) + error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); + if (!error) xfs_trans_brelse(tp, bp); - return 0; + return error; } /* @@ -2523,18 +2781,15 @@ xfs_alloc_put_freelist( xfs_agblock_t bno, /* block being freed */ int btreeblk) /* block came from a AGF btree */ { - xfs_agf_t *agf; /* a.g. freespace structure */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = agbp->b_addr; __be32 *blockp;/* pointer to array entry */ int error; int logflags; - xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ __be32 *agfl_bno; int startoff; - agf = XFS_BUF_TO_AGF(agbp); - mp = tp->t_mountp; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; @@ -2560,7 +2815,7 @@ xfs_alloc_put_freelist( ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); startoff = (char *)blockp - (char *)agflbp->b_addr; @@ -2578,13 +2833,12 @@ xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if 
(!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) return __this_address; } @@ -2598,6 +2852,13 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; + if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + return __this_address; + + if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || + be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || @@ -2609,6 +2870,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return __this_address; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + return __this_address; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. 
growfs ensures we can't @@ -2623,6 +2888,11 @@ xfs_agf_verify( return __this_address; if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_blocks) > + be32_to_cpu(agf->agf_length)) + return __this_address; + + if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return __this_address; @@ -2654,6 +2924,7 @@ xfs_agf_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agf *agf = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agf_verify(bp); @@ -2666,7 +2937,7 @@ xfs_agf_write_verify( return; if (bip) - XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } @@ -2695,14 +2966,11 @@ xfs_read_agf( trace_xfs_read_agf(mp, agno); ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); xfs_buf_set_ref(*bpp, XFS_AGF_REF); @@ -2726,17 +2994,18 @@ xfs_alloc_read_agf( trace_xfs_alloc_read_agf(mp, agno); + /* We don't support trylock when freeing. */ + ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != + (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, bpp); if (error) return error; - if (!*bpp) - return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); + agf = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); @@ -2956,13 +3225,6 @@ xfs_alloc_vextent( args->len); #endif - /* Zero the extent if we were asked to do so */ - if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { - error = xfs_zero_extent(args->ip, args->fsbno, args->len); - if (error) - goto error0; - } - } xfs_perag_put(args->pag); return 0; @@ -3024,6 +3286,7 @@ __xfs_free_extent( struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + struct xfs_agf *agf; int error; unsigned int busy_flags = 0; @@ -3037,13 +3300,18 @@ __xfs_free_extent( error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; + agf = agbp->b_addr; - XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); + if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { + error = -EFSCORRUPTED; + goto err; + } /* validate the extent size is legal now we have the agf locked */ - XFS_WANT_CORRUPTED_GOTO(mp, - agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), - err); + if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { + error = -EFSCORRUPTED; + goto err; + } error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) @@ -3151,7 +3419,7 @@ xfs_agfl_walk( unsigned int i; int error; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); i = be32_to_cpu(agf->agf_flfirst); /* Nothing to walk in an empty AGFL. */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d6ed5d2c07c2..a851bf77f17b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -54,7 +54,6 @@ typedef struct xfs_alloc_arg { struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for a.g. 
freelist header */ struct xfs_perag *pag; /* per-ag struct for this agno */ - struct xfs_inode *ip; /* for userdata zeroing method */ xfs_fsblock_t fsbno; /* file system block number */ xfs_agnumber_t agno; /* allocation group number */ xfs_agblock_t agbno; /* allocation group-relative block # */ @@ -83,20 +82,7 @@ typedef struct xfs_alloc_arg { */ #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ - -static inline bool -xfs_alloc_is_userdata(int datatype) -{ - return (datatype & ~XFS_ALLOC_NOBUSY) != 0; -} - -static inline bool -xfs_alloc_allow_busy_reuse(int datatype) -{ - return (datatype & XFS_ALLOC_NOBUSY) == 0; -} +#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ #define XFS_ALLOC_AGFL_RESERVE 4 @@ -250,4 +236,13 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); +static inline __be32 * +xfs_buf_to_agfl_bno( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + return bp->b_addr + sizeof(struct xfs_agfl); + return bp->b_addr; +} + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 2a94543857a1..60c453cb3ee3 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -25,7 +26,7 @@ xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, 
cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -35,8 +36,8 @@ xfs_allocbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -62,7 +63,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -72,7 +73,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -86,8 +87,8 @@ xfs_allocbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; @@ -113,7 +114,7 @@ xfs_allocbt_update_lastrec( int ptr, int reason) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag; __be32 len; @@ -162,7 +163,7 @@ xfs_allocbt_update_lastrec( pag = xfs_perag_get(cur->bc_mp, seqno); pag->pagf_longest = be32_to_cpu(len); xfs_perag_put(pag); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } STATIC int @@ -226,9 +227,9 @@ xfs_allocbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr 
*ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -471,18 +472,14 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .recs_inorder = xfs_cntbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_btnum_t btnum) /* btree identifier */ +/* Allocate most of a new allocation btree cursor. */ +STATIC struct xfs_btree_cur * +xfs_allocbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); @@ -495,18 +492,16 @@ xfs_allocbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; if (btnum == XFS_BTNUM_CNT) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_ops = &xfs_cntbt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); cur->bc_ops = &xfs_bnobt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; + cur->bc_ag.abt.active = false; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -515,6 +510,73 @@ xfs_allocbt_init_cursor( } /* + * Allocate a new allocation btree cursor. 
+ */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_CNT) + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + else + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + + cur->bc_ag.agbp = agbp; + + return cur; +} + +/* Create a free space btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_allocbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new free space btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_allocbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + } else { + cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + } +} + +/* * Calculate number of records in an alloc btree block. 
*/ int diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index c9305ebb69f6..047f09f0be3c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size depends on a superblock flag. @@ -48,8 +49,14 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); +void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 510ca6974604..e4fe3dca9883 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -56,32 +56,6 @@ STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); - -STATIC int -xfs_attr_args_init( - struct xfs_da_args *args, - struct xfs_inode *dp, - const unsigned char *name, - int flags) -{ - - if (!name) - return -EINVAL; - - memset(args, 0, sizeof(*args)); - args->geo = dp->i_mount->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->dp = dp; - args->flags = flags; - args->name = name; - args->namelen = strlen((const char *)name); - if (args->namelen >= MAXNAMELEN) - return -EFAULT; /* match IRIX behaviour */ - - args->hashval = xfs_da_hashname(args->name, args->namelen); - return 0; -} - int xfs_inode_hasattr( struct xfs_inode *ip) @@ -103,84 +77,60 @@ xfs_inode_hasattr( */ int xfs_attr_get_ilocked( - 
struct xfs_inode *ip, struct xfs_da_args *args) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - if (!xfs_inode_hasattr(ip)) + if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + + if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) return xfs_attr_leaf_get(args); - else - return xfs_attr_node_get(args); + return xfs_attr_node_get(args); } /* * Retrieve an extended attribute by name, and its value if requested. * - * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, - * just an indication whether the attribute exists and the size of the value if - * it exists. The size is returned in @valuelenp, + * If args->valuelen is zero, then the caller does not want the value, just an + * indication whether the attribute exists and the size of the value if it + * exists. The size is returned in args.valuelen. * - * If the attribute is found, but exceeds the size limit set by the caller in - * @valuelenp, return -ERANGE with the size of the attribute that was found in - * @valuelenp. + * If args->value is NULL but args->valuelen is non-zero, allocate the buffer + * for the value after existence of the attribute has been determined. The + * caller always has to free args->value if it is set, no matter if this + * function was successful or not. * - * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after - * existence of the attribute has been determined. On success, return that - * buffer to the caller and leave them to free it. On failure, free any - * allocated buffer and ensure the buffer pointer returned to the caller is - * null. 
+ * If the attribute is found, but exceeds the size limit set by the caller in + * args->valuelen, return -ERANGE with the size of the attribute that was found + * in args->valuelen. */ int xfs_attr_get( - struct xfs_inode *ip, - const unsigned char *name, - unsigned char **value, - int *valuelenp, - int flags) + struct xfs_da_args *args) { - struct xfs_da_args args; uint lock_mode; int error; - ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); - - XFS_STATS_INC(ip->i_mount, xs_attr_get); + XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(args->dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, flags); - if (error) - return error; + args->geo = args->dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* Entirely possible to look up a name which doesn't exist */ - args.op_flags = XFS_DA_OP_OKNOENT; - if (flags & ATTR_ALLOC) - args.op_flags |= XFS_DA_OP_ALLOCVAL; - else - args.value = *value; - args.valuelen = *valuelenp; + args->op_flags = XFS_DA_OP_OKNOENT; - lock_mode = xfs_ilock_attr_map_shared(ip); - error = xfs_attr_get_ilocked(ip, &args); - xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; + lock_mode = xfs_ilock_attr_map_shared(args->dp); + error = xfs_attr_get_ilocked(args); + xfs_iunlock(args->dp, lock_mode); - /* on error, we have to clean up allocated value buffers */ - if (error) { - if (flags & ATTR_ALLOC) { - kmem_free(args.value); - *value = NULL; - } - return error; - } - *value = args.value; - return 0; + return error; } /* @@ -236,7 +186,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). 
*/ - if (!error && (args->flags & ATTR_KERNOTIME) == 0) + if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -334,186 +284,127 @@ xfs_attr_remove_args( return error; } +/* + * Note: If args->value is NULL the attribute will be removed, just like the + * Linux ->setattr API. + */ int xfs_attr_set( - struct xfs_inode *dp, - const unsigned char *name, - unsigned char *value, - int valuelen, - int flags) + struct xfs_da_args *args) { + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; struct xfs_trans_res tres; - int rsvd = (flags & ATTR_ROOT) != 0; + bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; - - XFS_STATS_INC(mp, xs_attr_set); + unsigned int total; if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); - if (error) - return error; - - args.value = value; - args.valuelen = valuelen; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp); if (error) return error; - /* - * If the inode doesn't have an attribute fork, add one. - * (inode must not be locked when we call this routine) - */ - if (XFS_IFORK_Q(dp) == 0) { - int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); - if (error) - return error; - } - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - - /* - * Root fork attributes can use reserved data blocks for this - * operation if necessary - */ - error = xfs_trans_alloc(mp, &tres, args.total, 0, - rsvd ? 
XFS_TRANS_RESERVE : 0, &args.trans); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args); - if (error) - goto out_trans_cancel; - if (!args.trans) { - /* shortform attribute has already been committed */ - goto out_unlock; - } - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* - * Commit the last in the sequence of transactions. + * We have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail as well. */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); -out_unlock: - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + args->op_flags = XFS_DA_OP_OKNOENT; -out_trans_cancel: - if (args.trans) - xfs_trans_cancel(args.trans); - goto out_unlock; -} + if (args->value) { + XFS_STATS_INC(mp, xs_attr_set); -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. 
- */ -int -xfs_attr_remove( - struct xfs_inode *dp, - const unsigned char *name, - int flags) -{ - struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; - int error; - - XFS_STATS_INC(mp, xs_attr_remove); + args->op_flags |= XFS_DA_OP_ADDNAME; + args->total = xfs_attr_calc_size(args, &local); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return -EIO; + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(struct xfs_attr_sf_hdr) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + args->valuelen); - error = xfs_attr_args_init(&args, dp, name, flags); - if (error) - return error; + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } - /* - * we have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail. - */ - args.op_flags = XFS_DA_OP_OKNOENT; + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + total = args->total; + } else { + XFS_STATS_INC(mp, xs_attr_remove); - error = xfs_qm_dqattach(dp); - if (error) - return error; + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0, - (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, - &args.trans); + error = xfs_trans_alloc(mp, &tres, total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); if (error) return error; xfs_ilock(dp, XFS_ILOCK_EXCL); - /* - * No need to make quota reservations here. We expect to release some - * blocks not allocate in the common case. 
- */ - xfs_trans_ijoin(args.trans, dp, 0); - - error = xfs_attr_remove_args(&args); - if (error) - goto out; + xfs_trans_ijoin(args->trans, dp, 0); + if (args->value) { + unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; + + if (rsvd) + quota_flags |= XFS_QMOPT_FORCE_RES; + error = xfs_trans_reserve_quota_nblks(args->trans, dp, + args->total, 0, quota_flags); + if (error) + goto out_trans_cancel; + error = xfs_attr_set_args(args); + if (error) + goto out_trans_cancel; + /* shortform attribute has already been committed */ + if (!args->trans) + goto out_unlock; + } else { + error = xfs_attr_remove_args(args); + if (error) + goto out_trans_cancel; + } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); + xfs_trans_set_sync(args->trans); - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + if (!(args->op_flags & XFS_DA_OP_NOTIME)) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. 
*/ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args->trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: - if (args.trans) - xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; +out_trans_cancel: + if (args->trans) + xfs_trans_cancel(args->trans); + goto out_unlock; } /*======================================================================== @@ -532,10 +423,10 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) trace_xfs_attr_sf_addname(args); retval = xfs_attr_shortform_lookup(args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); if (retval) @@ -545,7 +436,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * that the leaf format add routine won't trip over the attr * not being around. */ - args->flags &= ~ATTR_REPLACE; + args->attr_flags &= ~XATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -589,7 +480,7 @@ xfs_attr_leaf_addname( */ dp = args->dp; args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -598,14 +489,11 @@ xfs_attr_leaf_addname( * the given flags produce an error or call for an atomic rename. 
*/ retval = xfs_attr3_leaf_lookup_int(bp, args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - xfs_trans_brelse(args->trans, bp); - return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_trans_brelse(args->trans, bp); - return retval; - } + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_brelse; + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) + goto out_brelse; trace_xfs_attr_leaf_replace(args); @@ -715,7 +603,7 @@ xfs_attr_leaf_addname( * remove the "old" attr from that block (neat, huh!) */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - -1, &bp); + &bp); if (error) return error; @@ -746,6 +634,9 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return retval; } /* @@ -769,7 +660,7 @@ xfs_attr_leaf_removename( */ dp = args->dp; args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -813,7 +704,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -872,10 +763,10 @@ restart: goto out; blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto out; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) goto out; trace_xfs_attr_node_replace(args); @@ -1007,7 +898,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. 
*/ - args->flags |= XFS_ATTR_INCOMPLETE; + args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; @@ -1173,7 +1064,7 @@ xfs_attr_node_removename( ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); if (error) goto out; @@ -1266,10 +1157,9 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da3_node_read(state->args->trans, - state->args->dp, - blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); if (error) return error; } else { @@ -1285,10 +1175,9 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da3_node_read(state->args->trans, - state->args->dp, - blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 94badfa1743e..0d2d05908537 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -21,36 +21,6 @@ struct xfs_attr_list_context; * as possible so as to fit into the literal area of the inode. 
*/ -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" }, \ - { ATTR_ALLOC, "ALLOC" } - /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -59,45 +29,16 @@ struct xfs_attr_list_context; #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ /* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. 
- */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* * Kernel-internal version of the attrlist cursor. */ -typedef struct attrlist_cursor_kern { +struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; +}; /*======================================================================== @@ -109,27 +50,28 @@ typedef struct attrlist_cursor_kern { typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ +struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct xfs_attrlist_cursor_kern cursor; /* position in list */ + void *buffer; /* output buffer */ /* * Abort attribute list iteration if non-zero. 
Can be used to pass * error values to the xfs_attr_list caller. */ - int seen_enough; + int seen_enough; + bool allow_incomplete; - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +}; /*======================================================================== @@ -140,19 +82,14 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. 
*/ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_attr_list_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char **value, int *valuelenp, int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); +int xfs_attr_get_ilocked(struct xfs_da_args *args); +int xfs_attr_get(struct xfs_da_args *args); +int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index f0089e862216..863444e2dda7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -233,6 +233,61 @@ xfs_attr3_leaf_hdr_to_disk( } static xfs_failaddr_t +xfs_attr3_leaf_verify_entry( + struct xfs_mount *mp, + char *buf_end, + struct xfs_attr_leafblock *leaf, + struct xfs_attr3_icleaf_hdr *leafhdr, + struct xfs_attr_leaf_entry *ent, + int idx, + __u32 *last_hashval) +{ + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + char *name_end; + unsigned int nameidx; + unsigned int namesize; + __u32 hashval; + + /* hash order check */ + hashval = be32_to_cpu(ent->hashval); + if (hashval < *last_hashval) + return __this_address; + *last_hashval = hashval; + + nameidx = 
be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize) + return __this_address; + + /* + * Check the name information. The namelen fields are u8 so we can't + * possibly exceed the maximum name length of 255 bytes. + */ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, idx); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, + be16_to_cpu(lentry->valuelen)); + name_end = (char *)lentry + namesize; + if (lentry->namelen == 0) + return __this_address; + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, idx); + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + name_end = (char *)rentry + namesize; + if (rentry->namelen == 0) + return __this_address; + if (!(ent->flags & XFS_ATTR_INCOMPLETE) && + rentry->valueblk == 0) + return __this_address; + } + + if (name_end > buf_end) + return __this_address; + + return NULL; +} + +static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) { @@ -240,7 +295,10 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *ent; + char *buf_end; uint32_t end; /* must be 32bit - see below */ + __u32 last_hashval = 0; int i; xfs_failaddr_t fa; @@ -273,8 +331,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; - /* XXX: need to range check rest of attr header values */ - /* XXX: hash order check? */ + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { + fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, + ent, i, &last_hashval); + if (fa) + return fa; + } /* * Quickly check the freemap information. 
Attribute data has to be @@ -367,13 +430,12 @@ xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, + &xfs_attr3_leaf_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return err; @@ -383,14 +445,25 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ -/* - * If namespace bits don't match return 0. - * If all match then return 1. - */ -STATIC int -xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +static bool +xfs_attr_match( + struct xfs_da_args *args, + uint8_t namelen, + unsigned char *name, + int flags) { - return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); + if (args->namelen != namelen) + return false; + if (memcmp(args->name, name, namelen) != 0) + return false; + /* + * If we are looking for incomplete entries, show only those, else only + * show complete entries. + */ + if (args->attr_filter != + (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) + return false; + return true; } static int @@ -402,7 +475,7 @@ xfs_attr_copy_value( /* * No copy if all we have to do is get the length */ - if (args->flags & ATTR_KERNOVAL) { + if (!args->valuelen) { args->valuelen = valuelen; return 0; } @@ -415,7 +488,7 @@ xfs_attr_copy_value( return -ERANGE; } - if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + if (!args->value) { args->value = kmem_alloc_large(valuelen, 0); if (!args->value) return -ENOMEM; @@ -453,16 +526,18 @@ xfs_attr_copy_value( * special case for dev/uuid inodes, they have fixed size data forks. 
*/ int -xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) +xfs_attr_shortform_bytesfit( + struct xfs_inode *dp, + int bytes) { - int offset; - int minforkoff; /* lower limit on valid forkoff locations */ - int maxforkoff; /* upper limit on valid forkoff locations */ - int dsize; - xfs_mount_t *mp = dp->i_mount; + struct xfs_mount *mp = dp->i_mount; + int64_t dsize; + int minforkoff; + int maxforkoff; + int offset; /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -525,12 +600,11 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ - minforkoff = max(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -614,15 +688,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { -#ifdef DEBUG - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - ASSERT(0); -#endif + ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)); } offset = (char *)sfe - (char *)sf; @@ -633,7 +700,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = 
args->valuelen; - sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; @@ -685,13 +752,9 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), base += size, i++) { size = XFS_ATTR_SF_ENTSIZE(sfe); - if (sfe->namelen != args->namelen) - continue; - if (memcmp(sfe->nameval, args->name, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - break; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + break; } if (i == end) return -ENOATTR; @@ -752,23 +815,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return -EEXIST; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return -EEXIST; } return -ENOATTR; } /* - * Retreive the attribute value and length. + * Retrieve the attribute value and length. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. 
*/ int xfs_attr_shortform_getvalue( @@ -783,14 +842,10 @@ xfs_attr_shortform_getvalue( sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], - sfe->valuelen); + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return xfs_attr_copy_value(args, + &sfe->nameval[args->namelen], sfe->valuelen); } return -ENOATTR; } @@ -854,7 +909,7 @@ xfs_attr_shortform_to_leaf( nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -924,7 +979,7 @@ xfs_attr_shortform_verify( char *endp; struct xfs_ifork *ifp; int i; - int size; + int64_t size; ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); @@ -1060,7 +1115,7 @@ xfs_attr3_leaf_to_shortform( nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1080,7 +1135,6 @@ xfs_attr3_leaf_to_node( struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr icleafhdr; struct xfs_attr_leaf_entry *entries; - struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr icnodehdr; struct xfs_da_intnode *node; struct xfs_inode *dp = args->dp; @@ -1095,11 +1149,11 @@ xfs_attr3_leaf_to_node( error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - 
error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1); if (error) goto out; - error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK); if (error) goto out; @@ -1120,18 +1174,17 @@ xfs_attr3_leaf_to_node( if (error) goto out; node = bp1->b_addr; - dp->d_ops->node_hdr_from_disk(&icnodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node); leaf = bp2->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ - btree[0].hashval = entries[icleafhdr.count - 1].hashval; - btree[0].before = cpu_to_be32(blkno); + icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval; + icnodehdr.btree[0].before = cpu_to_be32(blkno); icnodehdr.count = 1; - dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr); xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: @@ -1161,7 +1214,7 @@ xfs_attr3_leaf_create( trace_xfs_attr_leaf_create(args); - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp, XFS_ATTR_FORK); if (error) return error; @@ -1387,8 +1440,9 @@ xfs_attr3_leaf_add_work( entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); - entry->flags = tmp ? 
XFS_ATTR_LOCAL : 0; - entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + entry->flags = args->attr_filter; + if (tmp) + entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -1447,7 +1501,9 @@ xfs_attr3_leaf_add_work( for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr->freemap[i].base == tmp) { ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); - ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= + min_t(uint16_t, ichdr->freemap[i].size, + sizeof(xfs_attr_leaf_entry_t)); } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); @@ -1931,7 +1987,7 @@ xfs_attr3_leaf_toosmall( if (blkno == 0) continue; error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, - blkno, -1, &bp); + blkno, &bp); if (error) return error; @@ -2281,8 +2337,10 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - if (ichdr.count >= args->geo->blksize / 8) + if (ichdr.count >= args->geo->blksize / 8) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Binary search. (note: small blocks will skip this loop) @@ -2298,10 +2356,14 @@ xfs_attr3_leaf_lookup_int( else break; } - if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) + if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; - if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) + } + if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Since we may have duplicate hashval's, find the first matching @@ -2329,33 +2391,17 @@ xfs_attr3_leaf_lookup_int( /* * GROT: Add code to remove incomplete entries. */ - /* - * If we are looking for INCOMPLETE entries, show only those. - * If we are looking for complete entries, show only those. 
- */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { - continue; - } if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (name_loc->namelen != args->namelen) - continue; - if (memcmp(args->name, name_loc->nameval, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_loc->namelen, + name_loc->nameval, entry->flags)) continue; args->index = probe; return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (name_rmt->namelen != args->namelen) - continue; - if (memcmp(args->name, name_rmt->name, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_rmt->namelen, + name_rmt->name, entry->flags)) continue; args->index = probe; args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); @@ -2374,9 +2420,9 @@ xfs_attr3_leaf_lookup_int( * Get the value associated with an attribute name from a leaf attribute * list structure. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr3_leaf_getvalue( @@ -2661,7 +2707,7 @@ xfs_attr3_leaf_clearflag( /* * Set up the operation. */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -2728,7 +2774,7 @@ xfs_attr3_leaf_setflag( /* * Set up the operation. 
*/ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); if (error) return error; @@ -2790,7 +2836,7 @@ xfs_attr3_leaf_flipflags( /* * Read the block containing the "old" attr */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1); if (error) return error; @@ -2799,7 +2845,7 @@ xfs_attr3_leaf_flipflags( */ if (args->blkno2 != args->blkno) { error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, - -1, &bp2); + &bp2); if (error) return error; } else { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 7b74e18becff..6dd2d937a42a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -8,7 +8,6 @@ #define __XFS_ATTR_LEAF_H__ struct attrlist; -struct attrlist_cursor_kern; struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; @@ -17,13 +16,27 @@ struct xfs_inode; struct xfs_trans; /* - * Used to keep a list of "remote value" extents when unlinking an inode. + * Incore version of the attribute leaf header. */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - +struct xfs_attr3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; + /* + * Firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + uint32_t firstused; + __u8 holes; + struct { + uint16_t base; + uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; /*======================================================================== * Function prototypes for the kernel. 
@@ -67,8 +80,8 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -void xfs_attr3_leaf_list_int(struct xfs_buf *bp, - struct xfs_attr_list_context *context); +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, + struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. @@ -85,8 +98,7 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp); + xfs_dablk_t bno, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3e39b7d40f25..01ad7f353e08 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -19,12 +19,30 @@ #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_attr.h" +#include "xfs_attr_remote.h" #include "xfs_trace.h" #include "xfs_error.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). 
+ * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -379,7 +397,7 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); - ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->valuelen != 0); ASSERT(args->rmtvaluelen == args->valuelen); valuelen = args->rmtvaluelen; @@ -400,17 +418,15 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); + error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, + 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -529,9 +545,9 @@ xfs_attr_rmtval_set( dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt); - if (!bp) - return -ENOMEM; + error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp); + if (error) + return error; bp->b_ops = &xfs_attr3_rmt_buf_ops; xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, @@ -551,6 +567,33 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. 
*/ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + return -EFSCORRUPTED; + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -559,7 +602,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -574,9 +616,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -587,22 +626,11 @@ xfs_attr_rmtval_remove( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. 
- */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 7071ff98fdbc..40ce5f3094d1 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -5,6 +5,7 @@ */ #include "xfs.h" #include "xfs_log_format.h" +#include "xfs_bit.h" /* * XFS bit manipulation routines, used in non-realtime code. 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02469d59c787..fda13cd7add0 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,6 +34,7 @@ #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" +#include "xfs_iomap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -192,14 +193,12 @@ xfs_default_attroffset( struct xfs_mount *mp = ip->i_mount; uint offset; - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { + if (mp->m_sb.sb_inodesize == 256) + offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + else offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + ASSERT(offset < XFS_LITINO(mp)); return offset; } @@ -383,8 +382,10 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(mp, - xfs_verify_fsbno(mp, bno), error0); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { + error = -EFSCORRUPTED; + goto error0; + } if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -611,8 +612,8 @@ xfs_bmap_btree_to_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - xfs_btree_check_lptr(cur, cbno, 1)); + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) + return -EFSCORRUPTED; #endif error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); @@ -687,7 +688,7 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. 
*/ @@ -724,14 +725,14 @@ xfs_bmap_extents_to_btree( ASSERT(tp->t_firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; + cur->bc_ino.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno); - if (!abp) { - error = -EFSCORRUPTED; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, args.fsbno), + mp->m_bsize, 0, &abp); + if (error) goto out_unreserve_dquot; - } /* * Fill in the child block. @@ -875,7 +876,11 @@ xfs_bmap_local_to_extents( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); tp->t_firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno); + error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, + XFS_FSB_TO_DADDR(args.mp, args.fsbno), + args.mp->m_bsize, 0, &bp); + if (error) + goto done; /* * Initialize the block, copy the data and log the remote buffer. @@ -936,14 +941,17 @@ xfs_bmap_add_attrfork_btree( if (error) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); + if (XFS_IS_CORRUPT(mp, stat != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -970,7 +978,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1083,7 +1091,7 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - if (ip->i_d.di_anextents != 0) { + if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { error = -EFSCORRUPTED; goto trans_cancel; } 
@@ -1154,6 +1162,65 @@ trans_cancel: * Internal and external extent tree search functions. */ +struct xfs_iread_state { + struct xfs_iext_cursor icur; + xfs_extnum_t loaded; +}; + +/* Stuff every bmbt record from this block into the incore extent map. */ +static int +xfs_iread_bmbt_block( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xfs_iread_state *ir = priv; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_ino.ip; + struct xfs_btree_block *block; + struct xfs_buf *bp; + struct xfs_bmbt_rec *frp; + xfs_extnum_t num_recs; + xfs_extnum_t j; + int whichfork = cur->bc_ino.whichfork; + + block = xfs_btree_get_block(cur, level, &bp); + + /* Abort if we find more records than nextents. */ + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(ir->loaded + num_recs > + XFS_IFORK_NEXTENTS(ip, whichfork))) { + xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", + (unsigned long long)ip->i_ino); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, + sizeof(*block), __this_address); + return -EFSCORRUPTED; + } + + /* Copy records into the incore cache. */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { + struct xfs_bmbt_irec new; + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(frp, &new); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iread_extents(2)", frp, + sizeof(*frp), fa); + return -EFSCORRUPTED; + } + xfs_iext_insert(ip, &ir->icur, &new, + xfs_bmap_fork_to_state(whichfork)); + trace_xfs_read_extent(ip, &ir->icur, + xfs_bmap_fork_to_state(whichfork), _THIS_IP_); + xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + } + + return 0; +} + /* * Read in extents from a btree-format inode. 
*/ @@ -1163,134 +1230,39 @@ xfs_iread_extents( struct xfs_inode *ip, int whichfork) { - struct xfs_mount *mp = ip->i_mount; - int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_iread_state ir; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - struct xfs_btree_block *block = ifp->if_broot; - struct xfs_iext_cursor icur; - struct xfs_bmbt_irec new; - xfs_fsblock_t bno; - struct xfs_buf *bp; - xfs_extnum_t i, j; - int level; - __be64 *pp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_cur *cur; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - if (unlikely(level == 0)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - goto out; - block = XFS_BUF_TO_BLOCK(bp); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(mp, - xfs_verify_fsbno(mp, bno), out_brelse); - xfs_trans_brelse(tp, bp); + if (XFS_IS_CORRUPT(mp, + XFS_IFORK_FORMAT(ip, whichfork) != + XFS_DINODE_FMT_BTREE)) { + error = -EFSCORRUPTED; + goto out; } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - xfs_iext_first(ifp, &icur); - - /* - * Loop over all leaf nodes. Copy information to the extent records. 
- */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > nextents)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, - __func__, block, sizeof(*block), - __this_address); - error = -EFSCORRUPTED; - goto out_brelse; - } - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. - */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - for (j = 0; j < num_recs; j++, frp++, i++) { - xfs_failaddr_t fa; - - xfs_bmbt_disk_get_all(frp, &new); - fa = xfs_bmap_validate_extent(ip, whichfork, &new); - if (fa) { - error = -EFSCORRUPTED; - xfs_inode_verifier_error(ip, error, - "xfs_iread_extents(2)", - frp, sizeof(*frp), fa); - goto out_brelse; - } - xfs_iext_insert(ip, &icur, &new, state); - trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); - xfs_iext_next(ifp, &icur); - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. 
- */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - goto out; - block = XFS_BUF_TO_BLOCK(bp); - } + ir.loaded = 0; + xfs_iext_first(ifp, &ir.icur); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block, + XFS_BTREE_VISIT_RECORDS, &ir); + xfs_btree_del_cursor(cur, error); + if (error) + goto out; - if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) { + if (XFS_IS_CORRUPT(mp, + ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { error = -EFSCORRUPTED; goto out; } - ASSERT(i == xfs_iext_count(ifp)); + ASSERT(ir.loaded == xfs_iext_count(ifp)); ifp->if_flags |= XFS_IFEXTENTS; return 0; - -out_brelse: - xfs_trans_brelse(tp, bp); out: xfs_iext_destroy(ifp); return error; @@ -1317,8 +1289,7 @@ xfs_bmap_first_unused( xfs_fileoff_t lowest, max; int error; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + ASSERT(xfs_ifork_has_extents(ip, whichfork) || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { @@ -1374,7 +1345,8 @@ xfs_bmap_last_before( case XFS_DINODE_FMT_EXTENTS: break; default: - return -EIO; + ASSERT(0); + return -EFSCORRUPTED; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1473,9 +1445,8 @@ xfs_bmap_last_offset( if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return -EIO; + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -1555,7 +1526,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || - (bma->cur->bc_private.b.flags & 
XFS_BTCUR_BPRV_WASDEL)); + (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1652,15 +1623,24 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1686,7 +1666,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1716,7 +1699,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &PREV); if (error) goto done; @@ -1741,11 +1727,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -1776,7 +1768,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - 
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; @@ -1797,11 +1792,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -1815,7 +1816,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1842,7 +1843,10 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(bma->cur, &RIGHT); if (error) goto done; @@ -1874,11 +1878,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -1892,7 +1902,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), 
startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -1954,11 +1964,17 @@ xfs_bmap_add_extent_delay_real( error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2007,8 +2023,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_private.b.allocated; - bma->cur->bc_private.b.allocated = 0; + da_new += bma->cur->bc_ino.allocated; + bma->cur->bc_ino.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2152,19 +2168,34 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } 
error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; @@ -2190,13 +2221,22 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; @@ -2225,13 +2265,22 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2254,7 +2303,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2284,7 +2336,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; 
@@ -2318,14 +2373,20 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2352,7 +2413,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; @@ -2386,17 +2450,26 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2430,7 +2503,10 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* new right extent - oldext */ error = xfs_bmbt_update(cur, &r[1]); if (error) @@ -2439,7 +2515,10 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = PREV; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, 
done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2448,11 +2527,17 @@ xfs_bmap_add_extent_unwritten_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } /* new middle extent - newext */ if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; @@ -2486,7 +2571,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; *curp = cur; } @@ -2665,7 +2750,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2735,15 +2820,24 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &right, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &left); if (error) goto done; @@ -2769,7 +2863,10 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); 
+ if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &left); if (error) goto done; @@ -2796,7 +2893,10 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_bmbt_update(cur, &right); if (error) goto done; @@ -2819,11 +2919,17 @@ xfs_bmap_add_extent_hole_real( error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } break; } @@ -2847,7 +2953,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -3058,7 +3164,7 @@ xfs_bmap_adjacent( mp = ap->ip->i_mount; nullfb = ap->tp->t_firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && - xfs_alloc_is_userdata(ap->datatype); + (ap->datatype & XFS_ALLOC_USERDATA); fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock); /* @@ -3203,11 +3309,12 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); - if (error) - goto out; - - if (!pag->pagf_init) { - *notinit = 1; + if (error) { + /* Couldn't lock the AGF, so skip this AG. 
*/ + if (error == -EAGAIN) { + *notinit = 1; + error = 0; + } goto out; } } @@ -3411,7 +3518,7 @@ xfs_bmap_btalloc( if (ap->flags & XFS_BMAPI_COWFORK) align = xfs_get_cowextsz_hint(ap->ip); - else if (xfs_alloc_is_userdata(ap->datatype)) + else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3426,7 +3533,7 @@ xfs_bmap_btalloc( fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock); if (nullfb) { - if (xfs_alloc_is_userdata(ap->datatype) && + if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; @@ -3466,7 +3573,7 @@ xfs_bmap_btalloc( * enough for the request. If one isn't found, then adjust * the minimum allocation size to the largest space found. */ - if (xfs_alloc_is_userdata(ap->datatype) && + if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else @@ -3500,13 +3607,11 @@ xfs_bmap_btalloc( args.mod = args.prod - args.mod; } /* - * If we are not low on available data blocks, and the - * underlying logical volume manager is a stripe, and - * the file offset is zero then try to allocate data - * blocks on stripe unit boundary. - * NOTE: ap->aeof is only set if the allocation length - * is >= the stripe unit and the allocation offset is - * at the end of file. + * If we are not low on available data blocks, and the underlying + * logical volume manager is a stripe, and the file offset is zero then + * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof + * is only set if the allocation length is >= the stripe unit and the + * allocation offset is at the end of file. 
*/ if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) { if (!ap->offset) { @@ -3514,9 +3619,11 @@ xfs_bmap_btalloc( atype = args.type; isaligned = 1; /* - * Adjust for alignment + * Adjust minlen to try and preserve alignment if we + * can't guarantee an aligned maxlen extent. */ - if (blen > args.alignment && blen <= args.maxlen) + if (blen > args.alignment && + blen <= args.maxlen + args.alignment) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -3554,8 +3661,6 @@ xfs_bmap_btalloc( args.wasdel = ap->wasdel; args.resv = XFS_AG_RESV_NONE; args.datatype = ap->datatype; - if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) - args.ip = ap->ip; error = xfs_alloc_vextent(&args); if (error) @@ -3640,20 +3745,6 @@ xfs_bmap_btalloc( return 0; } -/* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int -xfs_bmap_alloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ -{ - if (XFS_IS_REALTIME_INODE(ap->ip) && - xfs_alloc_is_userdata(ap->datatype)) - return xfs_bmap_rtalloc(ap); - return xfs_bmap_btalloc(ap); -} - /* Trim extent to fit a logical block range. 
*/ void xfs_trim_extent( @@ -3815,11 +3906,8 @@ xfs_bmapi_read( XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4010,6 +4098,39 @@ out_unreserve_quota: } static int +xfs_bmap_alloc_userdata( + struct xfs_bmalloca *bma) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = xfs_bmapi_whichfork(bma->flags); + int error; + + /* + * Set the data type being allocated. For the data fork, the first data + * in the file is treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range is not on + * the busy list. 
+ */ + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + + if (mp->m_dalign && bma->length >= mp->m_dalign) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + if (XFS_IS_REALTIME_INODE(bma->ip)) + return xfs_bmap_rtalloc(bma); + } + + return xfs_bmap_btalloc(bma); +} + +static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) { @@ -4028,7 +4149,8 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev); + if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) + bma->prev.br_startoff = NULLFILEOFF; } else { bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) @@ -4036,43 +4158,24 @@ xfs_bmapi_allocate( bma->got.br_startoff - bma->offset); } - /* - * Set the data type being allocated. For the data fork, the first data - * in the file is treated differently to all other allocations. For the - * attribute fork, we only need to ensure the allocated range is not on - * the busy list. - */ - if (!(bma->flags & XFS_BMAPI_METADATA)) { - bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - else - bma->datatype |= XFS_ALLOC_USERDATA; - } - if (bma->flags & XFS_BMAPI_ZERO) - bma->datatype |= XFS_ALLOC_USERDATA_ZERO; - } + if (bma->flags & XFS_BMAPI_CONTIG) + bma->minlen = bma->length; + else + bma->minlen = 1; - bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + if (bma->flags & XFS_BMAPI_METADATA) + error = xfs_bmap_btalloc(bma); + else + error = xfs_bmap_alloc_userdata(bma); + if (error || bma->blkno == NULLFSBLOCK) + return error; - /* - * Only want to do the alignment at the eof if it is userdata and - * allocation length is larger than a stripe unit. 
- */ - if (mp->m_dalign && bma->length >= mp->m_dalign && - !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { - error = xfs_bmap_isaeof(bma, whichfork); + if (bma->flags & XFS_BMAPI_ZERO) { + error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); if (error) return error; } - error = xfs_bmap_alloc(bma); - if (error) - return error; - - if (bma->blkno == NULLFSBLOCK) - return 0; if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); /* @@ -4082,8 +4185,8 @@ xfs_bmapi_allocate( bma->nallocs++; if (bma->cur) - bma->cur->bc_private.b.flags = - bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + bma->cur->bc_ino.flags = + bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4312,11 +4415,8 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4456,16 +4556,21 @@ int xfs_bmapi_convert_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap, + xfs_off_t offset, + struct iomap *iomap, unsigned int *seq) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; + uint16_t flags = 0; struct xfs_trans *tp; int error; + if (whichfork == XFS_COW_FORK) + flags |= IOMAP_F_SHARED; + /* * Space for the extent and indirect blocks was reserved when the * delalloc extent was created so there's no need to do so here. 
@@ -4495,7 +4600,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4505,7 +4610,6 @@ xfs_bmapi_convert_delalloc( bma.wasdel = true; bma.offset = bma.got.br_startoff; bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); - bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); if (whichfork == XFS_COW_FORK) bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; @@ -4528,7 +4632,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) @@ -4578,11 +4682,8 @@ xfs_bmapi_remap( ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4606,7 +4707,7 @@ xfs_bmapi_remap( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } got.br_startoff = bno; @@ -5013,7 +5114,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } if (got.br_startoff == del->br_startoff) @@ -5037,7 +5141,10 @@ xfs_bmap_del_extent_real( 
} if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case BMAP_LEFT_FILLING: /* @@ -5108,7 +5215,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Update the btree record back * to the original value. @@ -5125,7 +5235,10 @@ xfs_bmap_del_extent_real( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5192,7 +5305,7 @@ __xfs_bunmapi( int isrt; /* freeing in rt area */ int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ - struct xfs_mount *mp; /* mount structure */ + struct xfs_mount *mp = ip->i_mount; int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ @@ -5209,14 +5322,8 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, - ip->i_mount); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) return -EFSCORRUPTED; - } - mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -5255,7 +5362,7 @@ __xfs_bunmapi( if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } else cur = NULL; @@ -5300,7 +5407,7 @@ __xfs_bunmapi( * Make sure we don't 
touch multiple AGF headers out of order * in a single transaction, as that could cause AB-BA deadlocks. */ - if (!wasdel) { + if (!wasdel && !isrt) { agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); if (prev_agno != NULLAGNUMBER && prev_agno > agno) break; @@ -5376,16 +5483,17 @@ __xfs_bunmapi( } div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); if (mod) { + xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; + /* * Realtime extent is lined up at the end but not * at the front. We'll get rid of full extents if * we can. */ - mod = mp->m_sb.sb_rextsize - mod; - if (del.br_blockcount > mod) { - del.br_blockcount -= mod; - del.br_startoff += mod; - del.br_startblock += mod; + if (del.br_blockcount > off) { + del.br_blockcount -= off; + del.br_startoff += off; + del.br_startblock += off; } else if (del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || tp->t_blk_res == 0)) { @@ -5403,6 +5511,7 @@ __xfs_bunmapi( continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; + xfs_fileoff_t unwrite_start; /* * This one is already unwritten. 
@@ -5416,12 +5525,13 @@ __xfs_bunmapi( ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); - if (prev.br_startoff < start) { - mod = start - prev.br_startoff; - prev.br_blockcount -= mod; - prev.br_startblock += mod; - prev.br_startoff = start; - } + unwrite_start = max3(start, + del.br_startoff - mod, + prev.br_startoff); + mod = unwrite_start - prev.br_startoff; + prev.br_startoff = unwrite_start; + prev.br_startblock += mod; + prev.br_blockcount -= mod; prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, @@ -5508,7 +5618,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5610,18 +5720,21 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_bmbt_update(cur, &new); if (error) @@ -5669,7 +5782,8 @@ xfs_bmap_shift_update_extent( error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; error = xfs_bmbt_update(cur, got); if (error) @@ -5705,11 +5819,8 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, 
XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5726,15 +5837,17 @@ xfs_bmap_collapse_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; goto del_cursor; } - XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), - del_cursor); + if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + error = -EFSCORRUPTED; + goto del_cursor; + } new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { @@ -5823,11 +5936,8 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5844,7 +5954,7 @@ xfs_bmap_insert_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (*next_fsb == NULLFSBLOCK) { @@ -5860,11 +5970,13 @@ xfs_bmap_insert_extents( goto del_cursor; } } - XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), - del_cursor); + if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + error = -EFSCORRUPTED; + goto del_cursor; + } - if (stop_fsb >= got.br_startoff + got.br_blockcount) { - error = -EIO; + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { + error = -EFSCORRUPTED; goto del_cursor; } @@ -5911,8 +6023,8 @@ del_cursor: * @split_fsb is a 
block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) @@ -5929,12 +6041,8 @@ xfs_bmap_split_extent_at( int logflags = 0; int i = 0; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmap_split_extent_at", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5964,11 +6072,14 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto del_cursor; + } } got.br_blockcount = gotblkcnt; @@ -5993,11 +6104,17 @@ xfs_bmap_split_extent_at( error = xfs_bmbt_lookup_eq(cur, &new, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto del_cursor; + } error = xfs_btree_insert(cur, &i); if (error) goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto del_cursor; + } } /* @@ -6014,7 +6131,7 @@ xfs_bmap_split_extent_at( del_cursor: if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6023,34 +6140,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int 
error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. */ static bool xfs_bmap_is_update_needed( diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e2798c6f3a5f..f3259ad5c22c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -222,14 +222,14 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, - unsigned int *seq); + xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ffe608d2a2d9..295a59cf8840 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -166,13 +166,13 @@ xfs_bmbt_dup_cursor( struct xfs_btree_cur *new; new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.b.ip, 
cur->bc_private.b.whichfork); + cur->bc_ino.ip, cur->bc_ino.whichfork); /* * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. */ - new->bc_private.b.flags = cur->bc_private.b.flags; + new->bc_ino.flags = cur->bc_ino.flags; return new; } @@ -183,12 +183,12 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; - src->bc_private.b.allocated = 0; + src->bc_ino.allocated = 0; } STATIC int @@ -205,8 +205,8 @@ xfs_bmbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.fsbno = cur->bc_tp->t_firstblock; - xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -230,7 +230,7 @@ xfs_bmbt_alloc_block( } args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; @@ -259,10 +259,10 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + cur->bc_ino.allocated++; + cur->bc_ino.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); new->l = cpu_to_be64(args.fsbno); @@ -280,12 +280,12 
@@ xfs_bmbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); struct xfs_owner_info oinfo; - xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; @@ -302,8 +302,8 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -320,8 +320,8 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -347,7 +347,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); + return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void @@ -566,11 +566,11 @@ xfs_bmbt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; + cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.ip = ip; + cur->bc_ino.allocated = 0; + cur->bc_ino.flags = 0; + cur->bc_ino.whichfork = whichfork; return cur; } @@ -644,7 +644,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, 
whichfork); if (!cur) return -ENOMEM; - cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 71de937f9e64..2d25bab68764 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" +#include "xfs_btree_staging.h" /* * Cursor allocation zone. @@ -105,11 +106,10 @@ xfs_btree_check_lblock( xfs_failaddr_t fa; fa = __xfs_btree_check_lblock(cur, block, level, bp); - if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, - XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -169,11 +169,10 @@ xfs_btree_check_sblock( xfs_failaddr_t fa; fa = __xfs_btree_check_sblock(cur, block, level, bp); - if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, - XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -216,7 +215,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); } /* @@ -236,8 +235,8 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork, cur->bc_btnum, + cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork, cur->bc_btnum, level, index); } else { if (xfs_btree_check_sptr(cur, 
be32_to_cpu((&ptr->s)[index]), @@ -245,7 +244,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.a.agno, cur->bc_btnum, + cur->bc_ag.agno, cur->bc_btnum, level, index); } @@ -380,11 +379,13 @@ xfs_btree_del_cursor( * allocated indirect blocks' accounting. */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); + cur->bc_ino.allocated == 0); /* * Free the cursor. */ - kmem_zone_free(xfs_btree_cur_zone, cur); + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free((void *)cur->bc_ops); + kmem_cache_free(xfs_btree_cur_zone, cur); } /* @@ -644,6 +645,17 @@ xfs_btree_ptr_addr( ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } +struct xfs_ifork * +xfs_btree_ifork_ptr( + struct xfs_btree_cur *cur) +{ + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + if (cur->bc_flags & XFS_BTREE_STAGING) + return cur->bc_ino.ifake->if_fork; + return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); +} + /* * Get the root block which is stored in the inode. * @@ -654,9 +666,8 @@ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); return (struct xfs_btree_block *)ifp->if_broot; } @@ -681,61 +692,6 @@ xfs_btree_get_block( } /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -xfs_buf_t * /* buffer for fsbno */ -xfs_btree_get_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno) /* file system block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* - * Get a buffer for the block, return it with no data read. 
- * Short-form addressing. - */ -xfs_buf_t * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno) /* allocation group block number */ -{ - xfs_daddr_t d; /* real disk block address */ - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); -} - -/* - * Check for the cursor referring to the last block at the given level. - */ -int /* 1=is last block, 0=not last block */ -xfs_btree_islastblock( - xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to check */ -{ - struct xfs_btree_block *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ - - block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); - else - return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); -} - -/* * Change the cursor to point to the first record at the given level. * Other levels are unaffected. 
*/ @@ -938,13 +894,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -1002,7 +958,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, agbno); } @@ -1071,7 +1027,7 @@ xfs_btree_ptr_is_null( return ptr->s == cpu_to_be32(NULLAGBLOCK); } -STATIC void +void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1107,7 +1063,7 @@ xfs_btree_get_sibling( } } -STATIC void +void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -1185,7 +1141,7 @@ xfs_btree_init_block( btnum, level, numrecs, owner, 0); } -STATIC void +void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, @@ -1201,9 +1157,9 @@ xfs_btree_init_block_cur( * code. 
*/ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_private.b.ip->i_ino; + owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_private.a.agno; + owner = cur->bc_ag.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, @@ -1277,7 +1233,7 @@ xfs_btree_set_refs( } } -STATIC int +int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, @@ -1291,11 +1247,10 @@ xfs_btree_get_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - - if (!*bpp) - return -ENOMEM; + error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, + 0, bpp); + if (error) + return error; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); @@ -1338,7 +1293,7 @@ xfs_btree_read_buf_block( /* * Copy keys from one btree block to another. */ -STATIC void +void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, @@ -1366,11 +1321,11 @@ xfs_btree_copy_recs( /* * Copy block pointers from one btree block to another. 
*/ -STATIC void +void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, - union xfs_btree_ptr *src_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); @@ -1451,8 +1406,8 @@ xfs_btree_log_keys( xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1494,8 +1449,8 @@ xfs_btree_log_ptrs( xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1563,8 +1518,8 @@ xfs_btree_log_block( xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1801,10 +1756,10 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && - !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_private.b.ip->i_ino) + cur->bc_ino.ip->i_ino) goto out_bad; /* Did we get the level we were looking for? */ @@ -1820,6 +1775,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -1867,7 +1823,7 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. 
*/ - if (cur->bc_nlevels == 0) + if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) return -EFSCORRUPTED; block = NULL; @@ -1987,7 +1943,8 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; *stat = 1; return 0; } @@ -2408,8 +2365,6 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ - int i; /* loop index */ - for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) @@ -2442,7 +2397,10 @@ xfs_btree_lshift( if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) @@ -2609,7 +2567,10 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_increment(tcur, level, &i); if (error) @@ -2990,9 +2951,9 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_private.b.ip, + xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), - cur->bc_private.b.whichfork); + cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); @@ -3005,7 +2966,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3157,11 +3118,11 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + 
struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ @@ -3463,7 +3424,10 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } level++; /* @@ -3504,8 +3468,8 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_private.b.whichfork; - struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_ino.whichfork; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; @@ -3563,8 +3527,8 @@ xfs_btree_kill_iroot( index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { - xfs_iroot_realloc(cur->bc_private.b.ip, index, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, index, + cur->bc_ino.whichfork); block = ifp->if_broot; } @@ -3593,7 +3557,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; @@ -3761,8 +3725,8 @@ xfs_btree_delrec( */ if (level == cur->bc_nlevels - 1) { if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, -1, + cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) @@ -3867,15 +3831,24 @@ xfs_btree_delrec( * Actually any entry but the first would suffice. 
*/ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3919,12 +3892,18 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } } @@ -3938,13 +3917,19 @@ xfs_btree_delrec( * previous block. */ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* Grab a pointer to the block. 
*/ left = xfs_btree_get_block(tcur, level, &lbp); @@ -4286,6 +4271,7 @@ int xfs_btree_visit_blocks( struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, + unsigned int flags, void *data) { union xfs_btree_ptr lptr; @@ -4311,6 +4297,11 @@ xfs_btree_visit_blocks( /* save for the next iteration of the loop */ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); + + if (!(flags & XFS_BTREE_VISIT_LEAVES)) + continue; + } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { + continue; } /* for each buffer in the level */ @@ -4413,7 +4404,7 @@ xfs_btree_change_owner( bbcoi.buffer_list = buffer_list; return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, - &bbcoi); + XFS_BTREE_VISIT_ALL, &bbcoi); } /* Verify the v5 fields of a long-format btree block. */ @@ -4865,7 +4856,7 @@ xfs_btree_count_blocks( { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, - blocks); + XFS_BTREE_VISIT_ALL, blocks); } /* Compare two btree pointers. */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index ced1e65d1483..8626c5a81aad 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_ifork; extern kmem_zone_t *xfs_btree_cur_zone; @@ -177,12 +178,37 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree private information. */ -union xfs_btree_cur_private { - struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ - } refc; +/* Per-AG btree information. 
*/ +struct xfs_btree_cur_ag { + union { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + }; + xfs_agnumber_t agno; + union { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; + struct { + bool active; /* allocation cursor state */ + } abt; + }; +}; + +/* Btree-in-inode cursor information */ +struct xfs_btree_cur_ino { + struct xfs_inode *ip; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + int allocated; + short forksize; + char whichfork; + char flags; +/* We are converting a delalloc reservation */ +#define XFS_BTCUR_BMBT_WASDEL (1 << 0) + +/* For extent swap, ignore owner check in verifier */ +#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; /* @@ -206,21 +232,9 @@ typedef struct xfs_btree_cur xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { - struct { /* needed for BNO, CNT, INO */ - struct xfs_buf *agbp; /* agf/agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - union xfs_btree_cur_private priv; - } a; - struct { /* needed for BMAP */ - struct xfs_inode *ip; /* pointer to our inode */ - int allocated; /* count of alloced */ - short forksize; /* fork's inode space */ - char whichfork; /* data or attr fork */ - char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ -#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ - } b; - } bc_private; /* per-btree type data */ + struct xfs_btree_cur_ag bc_ag; + struct xfs_btree_cur_ino bc_ino; + }; } xfs_btree_cur_t; /* cursor flags */ @@ -229,6 +243,12 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* + * The root of this btree is a fakeroot structure so that we can stage a btree + * rebuild without leaving it 
accessible via primary metadata. The ops struct + * is dynamically allocated and must be freed when the cursor is deleted. + */ +#define XFS_BTREE_STAGING (1<<5) #define XFS_BTREE_NOERROR 0 @@ -294,35 +314,6 @@ xfs_btree_dup_cursor( xfs_btree_cur_t **ncur);/* output cursor */ /* - * Get a buffer for the block, return it with no data read. - * Long-form addressing. - */ -struct xfs_buf * /* buffer for fsbno */ -xfs_btree_get_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno); /* file system block number */ - -/* - * Get a buffer for the block, return it with no data read. - * Short-form addressing. - */ -struct xfs_buf * /* buffer for agno/agbno */ -xfs_btree_get_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno); /* allocation group block number */ - -/* - * Check for the cursor referring to the last block at the given level. - */ -int /* 1=is last block, 0=not last block */ -xfs_btree_islastblock( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to check */ - -/* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ @@ -482,8 +473,15 @@ int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); +/* Visit record blocks. */ +#define XFS_BTREE_VISIT_RECORDS (1 << 0) +/* Visit leaf blocks. */ +#define XFS_BTREE_VISIT_LEAVES (1 << 1) +/* Visit all blocks. 
*/ +#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \ + XFS_BTREE_VISIT_LEAVES) int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, - xfs_btree_visit_blocks_fn fn, void *data); + xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); @@ -513,5 +511,39 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); + +/* Does this cursor point to the last block in the given level? */ +static inline bool +xfs_btree_islastblock( + xfs_btree_cur_t *cur, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, level, &bp); + ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); +} + +void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, struct xfs_buf **bpp); +void xfs_btree_set_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + int lr); +void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, int numrecs); +void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs); +void xfs_btree_copy_keys(struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, union xfs_btree_key *src_key, + int numkeys); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c 
b/fs/xfs/libxfs/xfs_btree_staging.c new file mode 100644 index 000000000000..f464a7c7cf22 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_btree_staging.h" + +/* + * Staging Cursors and Fake Roots for Btrees + * ========================================= + * + * A staging btree cursor is a special type of btree cursor that callers must + * use to construct a new btree index using the btree bulk loader code. The + * bulk loading code uses the staging btree cursor to abstract the details of + * initializing new btree blocks and filling them with records or key/ptr + * pairs. Regular btree operations (e.g. queries and modifications) are not + * supported with staging cursors, and callers must not invoke them. + * + * Fake root structures contain all the information about a btree that is under + * construction by the bulk loading code. Staging btree cursors point to fake + * root structures instead of the usual AG header or inode structure. + * + * Callers are expected to initialize a fake root structure and pass it into + * the _stage_cursor function for a specific btree type. When bulk loading is + * complete, callers should call the _commit_staged_btree function for that + * specific btree type to commit the new btree into the filesystem. + */ + +/* + * Don't allow staging cursors to be duplicated because they're supposed to be + * kept private to a single thread. 
+ */ +STATIC struct xfs_btree_cur * +xfs_btree_fakeroot_dup_cursor( + struct xfs_btree_cur *cur) +{ + ASSERT(0); + return NULL; +} + +/* + * Don't allow block allocation for a staging cursor, because staging cursors + * do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ +STATIC int +xfs_btree_fakeroot_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Don't allow block freeing for a staging cursor, because staging cursors + * do not support regular btree modifications. + */ +STATIC int +xfs_btree_fakeroot_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* Initialize a pointer to the root block from the fakeroot. */ +STATIC void +xfs_btree_fakeroot_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xbtree_afakeroot *afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + afake = cur->bc_ag.afake; + ptr->s = cpu_to_be32(afake->af_root); +} + +/* + * Bulk Loading for AG Btrees + * ========================== + * + * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to the + * staging cursor. Callers should initialize this to zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_afakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* Update the btree root information for a per-AG fake root.
*/ +STATIC void +xfs_btree_afakeroot_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + afake->af_root = be32_to_cpu(ptr->s); + afake->af_levels += inc; +} + +/* + * Initialize an AG-rooted btree cursor with the given AG btree fake root. + * The btree cursor's bc_ops will be overridden as needed to make the staging + * functionality work. + */ +void +xfs_btree_stage_afakeroot( + struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->set_root = xfs_btree_afakeroot_set_root; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ag.afake = afake; + cur->bc_nlevels = afake->af_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; +} + +/* + * Transform an AG-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this.
+ */ +void +xfs_btree_commit_afakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_afakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ag.agbp = agbp; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading for Inode-Rooted Btrees + * ==================================== + * + * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to + * the staging cursor. This structure should be initialized as follows: + * + * - if_fork_size field should be set to the number of bytes available to the + * fork in the inode. + * + * - if_fork should point to a freshly allocated struct xfs_ifork. + * + * - if_format should be set to the appropriate fork type (e.g. + * XFS_DINODE_FMT_BTREE). + * + * All other fields must be zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_ifakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* + * Initialize an inode-rooted btree cursor with the given inode btree fake + * root. The btree cursor's bc_ops will be overridden as needed to make the + * staging functionality work. If new_ops is not NULL, these new ops will be + * passed out to the caller for further overriding.
+ */ +void +xfs_btree_stage_ifakeroot( + struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ino.ifake = ifake; + cur->bc_nlevels = ifake->if_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; + + if (new_ops) + *new_ops = nops; +} + +/* + * Transform an inode-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_ifakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_ifakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ino.ifake = NULL; + cur->bc_ino.whichfork = whichfork; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading of Staged Btrees + * ============================= + * + * This interface is used with a staged btree cursor to create a totally new + * btree with a large number of records (i.e. more than what would fit in a + * single root block). When the creation is complete, the new root can be + * linked atomically into the filesystem by committing the staged cursor. 
+ * + * Creation of a new btree proceeds roughly as follows: + * + * The first step is to initialize an appropriate fake btree root structure and + * then construct a staged btree cursor. Refer to the block comments about + * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for + * more information about how to do this. + * + * The second step is to initialize a struct xfs_btree_bload context as + * documented in the structure definition. + * + * The third step is to call xfs_btree_bload_compute_geometry to compute the + * height of and the number of blocks needed to construct the btree. See the + * section "Computing the Geometry of the New Btree" for details about this + * computation. + * + * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and + * save them for later use by ->claim_block(). Bulk loading requires all + * blocks to be allocated beforehand to avoid ENOSPC failures midway through a + * rebuild, and to minimize seek distances of the new btree. + * + * Step five is to call xfs_btree_bload() to start constructing the btree. + * + * The final step is to commit the staging btree cursor, which logs the new + * btree root and turns the staging cursor into a regular cursor. The caller + * is responsible for cleaning up the previous btree blocks, if any. + * + * Computing the Geometry of the New Btree + * ======================================= + * + * The number of items placed in each btree block is computed via the following + * algorithm: For leaf levels, the number of items for the level is nr_records + * in the bload structure. For node levels, the number of items for the level + * is the number of blocks in the next lower level of the tree. 
For each + * level, the desired number of items per block is defined as: + * + * desired = max(minrecs, maxrecs - slack factor) + * + * The number of blocks for the level is defined to be: + * + * blocks = floor(nr_items / desired) + * + * Note this is rounded down so that the npb calculation below will never fall + * below minrecs. The number of items that will actually be loaded into each + * btree block is defined as: + * + * npb = nr_items / blocks + * + * Some of the leftmost blocks in the level will contain one extra record as + * needed to handle uneven division. If the number of records in any block + * would exceed maxrecs for that level, blocks is incremented and npb is + * recalculated. + * + * In other words, we compute the number of blocks needed to satisfy a given + * loading level, then spread the items as evenly as possible. + * + * The height and number of fs blocks required to create the btree are computed + * and returned via btree_height and nr_blocks. + */ + +/* + * Put a btree block that we're loading onto the ordered list and release it. + * The btree blocks will be written to disk when bulk loading is finished. + */ +static void +xfs_btree_bload_drop_buf( + struct list_head *buffers_list, + struct xfs_buf **bpp) +{ + if (*bpp == NULL) + return; + + if (!xfs_buf_delwri_queue(*bpp, buffers_list)) + ASSERT(0); + + xfs_buf_relse(*bpp); + *bpp = NULL; +} + +/* + * Allocate and initialize one btree block for bulk loading. + * + * The new btree block will have its level and numrecs fields set to the values + * of the level and nr_this_block parameters, respectively. + * + * The caller should ensure that ptrp, bpp, and blockp refer to the left + * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp + * will all point to the new block. 
+ */ +STATIC int +xfs_btree_bload_prep_block( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + unsigned int level, + unsigned int nr_this_block, + union xfs_btree_ptr *ptrp, /* in/out */ + struct xfs_buf **bpp, /* in/out */ + struct xfs_btree_block **blockp, /* in/out */ + void *priv) +{ + union xfs_btree_ptr new_ptr; + struct xfs_buf *new_bp; + struct xfs_btree_block *new_block; + int ret; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + size_t new_size; + + ASSERT(*bpp == NULL); + + /* Allocate a new incore btree root block. */ + new_size = bbl->iroot_size(cur, nr_this_block, priv); + ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot_bytes = (int)new_size; + ifp->if_flags |= XFS_IFBROOT; + + /* Initialize it and send it out. */ + xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + XFS_BUF_DADDR_NULL, cur->bc_btnum, level, + nr_this_block, cur->bc_ino.ip->i_ino, + cur->bc_flags); + + *bpp = NULL; + *blockp = ifp->if_broot; + xfs_btree_set_ptr_null(cur, ptrp); + return 0; + } + + /* Claim one of the caller's preallocated blocks. */ + xfs_btree_set_ptr_null(cur, &new_ptr); + ret = bbl->claim_block(cur, &new_ptr, priv); + if (ret) + return ret; + + ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); + + ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp); + if (ret) + return ret; + + /* + * The previous block (if any) is the left sibling of the new block, + * so set its right sibling pointer to the new block and drop it. + */ + if (*blockp) + xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); + xfs_btree_bload_drop_buf(buffers_list, bpp); + + /* Initialize the new btree block. */ + xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); + xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB); + + /* Set the out parameters. 
*/ + *bpp = new_bp; + *blockp = new_block; + xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1); + return 0; +} + +/* Load one leaf block. */ +STATIC int +xfs_btree_bload_leaf( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + xfs_btree_bload_get_record_fn get_record, + struct xfs_btree_block *block, + void *priv) +{ + unsigned int j; + int ret; + + /* Fill the leaf block with records. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_rec *block_rec; + + ret = get_record(cur, priv); + if (ret) + return ret; + block_rec = xfs_btree_rec_addr(cur, j, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return 0; +} + +/* + * Load one node block with key/ptr pairs. + * + * child_ptr must point to a block within the next level down in the tree. A + * key/ptr entry will be created in the new node block to the block pointed to + * by child_ptr. On exit, child_ptr points to the next block on the child + * level that needs processing. + */ +STATIC int +xfs_btree_bload_node( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + union xfs_btree_ptr *child_ptr, + struct xfs_btree_block *block) +{ + unsigned int j; + int ret; + + /* Fill the node block with keys and pointers. 
*/ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_key child_key; + union xfs_btree_ptr *block_ptr; + union xfs_btree_key *block_key; + struct xfs_btree_block *child_block; + struct xfs_buf *child_bp; + + ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); + + ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + &child_bp); + if (ret) + return ret; + + block_ptr = xfs_btree_ptr_addr(cur, j, block); + xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1); + + block_key = xfs_btree_key_addr(cur, j, block); + xfs_btree_get_keys(cur, child_block, &child_key); + xfs_btree_copy_keys(cur, block_key, &child_key, 1); + + xfs_btree_get_sibling(cur, child_block, child_ptr, + XFS_BB_RIGHTSIB); + xfs_buf_relse(child_bp); + } + + return 0; +} + +/* + * Compute the maximum number of records (or keyptrs) per block that we want to + * install at this level in the btree. Caller is responsible for having set + * @cur->bc_ino.forksize to the desired fork size, if appropriate. + */ +STATIC unsigned int +xfs_btree_bload_max_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int ret; + + if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) + return cur->bc_ops->get_dmaxrecs(cur, level); + + ret = cur->bc_ops->get_maxrecs(cur, level); + if (level == 0) + ret -= bbl->leaf_slack; + else + ret -= bbl->node_slack; + return ret; +} + +/* + * Compute the desired number of records (or keyptrs) per block that we want to + * install at this level in the btree, which must be somewhere between minrecs + * and max_npb. The caller is free to install fewer records per block. + */ +STATIC unsigned int +xfs_btree_bload_desired_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); + + /* Root blocks are not subject to minrecs rules. 
*/ + if (level == cur->bc_nlevels - 1) + return max(1U, npb); + + return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); +} + +/* + * Compute the number of records to be stored in each block at this level and + * the number of blocks for this level. For leaf levels, we must populate an + * empty root block even if there are no records, so we have to have at least + * one block. + */ +STATIC void +xfs_btree_bload_level_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level, + uint64_t nr_this_level, + unsigned int *avg_per_block, + uint64_t *blocks, + uint64_t *blocks_with_extra) +{ + uint64_t npb; + uint64_t dontcare; + unsigned int desired_npb; + unsigned int maxnr; + + maxnr = cur->bc_ops->get_maxrecs(cur, level); + + /* + * Compute the number of blocks we need to fill each block with the + * desired number of records/keyptrs per block. Because desired_npb + * could be minrecs, we use regular integer division (which rounds + * the block count down) so that in the next step the effective # of + * items per block will never be less than desired_npb. + */ + desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); + *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); + *blocks = max(1ULL, *blocks); + + /* + * Compute the number of records that we will actually put in each + * block, assuming that we want to spread the records evenly between + * the blocks. Take care that the effective # of items per block (npb) + * won't exceed maxrecs even for the blocks that get an extra record, + * since desired_npb could be maxrecs, and in the previous step we + * rounded the block count down. 
+ */ + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { + (*blocks)++; + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + } + + *avg_per_block = min_t(uint64_t, npb, nr_this_level); + + trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, + *avg_per_block, desired_npb, *blocks, + *blocks_with_extra); +} + +/* + * Ensure a slack value is appropriate for the btree. + * + * If the slack value is negative, set slack so that we fill the block to + * halfway between minrecs and maxrecs. Make sure the slack is never so large + * that we can underflow minrecs. + */ +static void +xfs_btree_bload_ensure_slack( + struct xfs_btree_cur *cur, + int *slack, + int level) +{ + int maxr; + int minr; + + maxr = cur->bc_ops->get_maxrecs(cur, level); + minr = cur->bc_ops->get_minrecs(cur, level); + + /* + * If slack is negative, automatically set slack so that we load the + * btree block approximately halfway between minrecs and maxrecs. + * Generally, this will net us 75% loading. + */ + if (*slack < 0) + *slack = maxr - ((maxr + minr) >> 1); + + *slack = min(*slack, maxr - minr); +} + +/* + * Prepare a btree cursor for a bulk load operation by computing the geometry + * fields in bbl. Caller must ensure that the btree cursor is a staging + * cursor. This function can be called multiple times. + */ +int +xfs_btree_bload_compute_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + uint64_t nr_records) +{ + uint64_t nr_blocks = 0; + uint64_t nr_this_level; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Make sure that the slack values make sense for traditional leaf and + * node blocks. Inode-rooted btrees will return different minrecs and + * maxrecs values for the root block (bc_nlevels == level - 1). We're + * checking levels 0 and 1 here, so set bc_nlevels such that the btree + * code doesn't interpret either as the root level. 
+ */ + cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1; + xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0); + xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); + + bbl->nr_records = nr_this_level = nr_records; + for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { + uint64_t level_blocks; + uint64_t dontcare64; + unsigned int level = cur->bc_nlevels - 1; + unsigned int avg_per_block; + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &level_blocks, &dontcare64); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * If all the items we want to store at this level + * would fit in the inode root block, then we have our + * btree root and are done. + * + * Note that bmap btrees forbid records in the root. + */ + if (level != 0 && nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* + * Otherwise, we have to store all the items for this + * level in traditional btree blocks and therefore need + * another level of btree to point to those blocks. + * + * We have to re-compute the geometry for each level of + * an inode-rooted btree because the geometry differs + * between a btree root in an inode fork and a + * traditional btree block. + * + * This distinction is made in the btree code based on + * whether level == bc_nlevels - 1. Based on the + * previous root block size check against the root + * block geometry, we know that we aren't yet ready to + * populate the root. Increment bc_nevels and + * recalculate the geometry for a traditional + * block-based btree level. + */ + cur->bc_nlevels++; + xfs_btree_bload_level_geometry(cur, bbl, level, + nr_this_level, &avg_per_block, + &level_blocks, &dontcare64); + } else { + /* + * If all the items we want to store at this level + * would fit in a single root block, we're done. + */ + if (nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* Otherwise, we need another level of btree. 
*/ + cur->bc_nlevels++; + } + + nr_blocks += level_blocks; + nr_this_level = level_blocks; + } + + if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + bbl->nr_blocks = nr_blocks - 1; + else + bbl->nr_blocks = nr_blocks; + return 0; +} + +/* Bulk load a btree given the parameters and geometry established in bbl. */ +int +xfs_btree_bload( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + void *priv) +{ + struct list_head buffers_list; + union xfs_btree_ptr child_ptr; + union xfs_btree_ptr ptr; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block = NULL; + uint64_t nr_this_level = bbl->nr_records; + uint64_t blocks; + uint64_t i; + uint64_t blocks_with_extra; + uint64_t total_blocks = 0; + unsigned int avg_per_block; + unsigned int level = 0; + int ret; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + INIT_LIST_HEAD(&buffers_list); + cur->bc_nlevels = bbl->btree_height; + xfs_btree_set_ptr_null(cur, &child_ptr); + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each leaf block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + /* + * Due to rounding, btree blocks will not be evenly populated + * in most cases. blocks_with_extra tells us how many blocks + * will receive an extra record to distribute the excess across + * the current level as evenly as possible. 
+ */ + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, + nr_this_block, &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, + nr_this_block); + + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + block, priv); + if (ret) + goto out; + + /* + * Record the leftmost leaf pointer so we know where to start + * with the first node level. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + + /* Populate the internal btree nodes. */ + for (level = 1; level < cur->bc_nlevels; level++) { + union xfs_btree_ptr first_ptr; + + nr_this_level = blocks; + block = NULL; + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each node block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, + &buffers_list, level, nr_this_block, + &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, + &ptr, nr_this_block); + + ret = xfs_btree_bload_node(cur, nr_this_block, + &child_ptr, block); + if (ret) + goto out; + + /* + * Record the leftmost node pointer so that we know + * where to start the next node level above this one. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); + } + + /* Initialize the new root. 
*/ + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + cur->bc_ino.ifake->if_levels = cur->bc_nlevels; + cur->bc_ino.ifake->if_blocks = total_blocks - 1; + } else { + cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); + cur->bc_ag.afake->af_levels = cur->bc_nlevels; + cur->bc_ag.afake->af_blocks = total_blocks; + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + ret = xfs_buf_delwri_submit(&buffers_list); + if (ret) + goto out; + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + ret = -EIO; + } + +out: + xfs_buf_delwri_cancel(&buffers_list); + if (bp) + xfs_buf_relse(bp); + return ret; +} diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h new file mode 100644 index 000000000000..643f0f9b2994 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_BTREE_STAGING_H__ +#define __XFS_BTREE_STAGING_H__ + +/* Fake root for an AG-rooted btree. */ +struct xbtree_afakeroot { + /* AG block number of the new btree root. */ + xfs_agblock_t af_root; + + /* Height of the new btree. */ + unsigned int af_levels; + + /* Number of blocks used by the btree. */ + unsigned int af_blocks; +}; + +/* Cursor interactions with with fake roots for AG-rooted btrees. */ +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake); +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + +/* Fake root for an inode-rooted btree. */ +struct xbtree_ifakeroot { + /* Fake inode fork. 
*/ + struct xfs_ifork *if_fork; + + /* Number of blocks used by the btree. */ + int64_t if_blocks; + + /* Height of the new btree. */ + unsigned int if_levels; + + /* Number of bytes available for this fork in the inode. */ + unsigned int if_fork_size; + + /* Fork format. */ + unsigned int if_format; + + /* Number of records. */ + unsigned int if_extents; +}; + +/* Cursor interactions with with fake roots for inode-rooted btrees. */ +void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops); +void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + int whichfork, const struct xfs_btree_ops *ops); + +/* Bulk loading of staged btrees. */ +typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, void *priv); +typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, + unsigned int nr_this_level, void *priv); + +struct xfs_btree_bload { + /* + * This function will be called nr_records times to load records into + * the btree. The function does this by setting the cursor's bc_rec + * field in in-core format. Records must be returned in sort order. + */ + xfs_btree_bload_get_record_fn get_record; + + /* + * This function will be called nr_blocks times to obtain a pointer + * to a new btree block on disk. Callers must preallocate all space + * for the new btree before calling xfs_btree_bload, and this function + * is what claims that reservation. + */ + xfs_btree_bload_claim_block_fn claim_block; + + /* + * This function should return the size of the in-core btree root + * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * types. + */ + xfs_btree_bload_iroot_size_fn iroot_size; + + /* + * The caller should set this to the number of records that will be + * stored in the new btree. 
+ */ + uint64_t nr_records; + + /* + * Number of free records to leave in each leaf block. If the caller + * sets this to -1, the slack value will be calculated to be be halfway + * between maxrecs and minrecs. This typically leaves the block 75% + * full. Note that slack values are not enforced on inode root blocks. + */ + int leaf_slack; + + /* + * Number of free key/ptrs pairs to leave in each node block. This + * field has the same semantics as leaf_slack. + */ + int node_slack; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * number of btree blocks needed to store nr_records records. + */ + uint64_t nr_blocks; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * height of the new btree. + */ + unsigned int btree_height; +}; + +int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, uint64_t nr_records); +int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl, + void *priv); + +#endif /* __XFS_BTREE_STAGING_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 4fd1223c1bd5..897749c41f36 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -12,9 +12,9 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_attr_leaf.h" @@ -107,7 +107,66 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); + kmem_cache_free(xfs_da_state_zone, state); +} + +static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) +{ + if (whichfork == XFS_DATA_FORK) + return mp->m_dir_geo->fsbcount; + return mp->m_attr_geo->fsbcount; +} + +void +xfs_da3_node_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_da3_icnode_hdr *to, 
+ struct xfs_da_intnode *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from; + + to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); + to->back = be32_to_cpu(from3->hdr.info.hdr.back); + to->magic = be16_to_cpu(from3->hdr.info.hdr.magic); + to->count = be16_to_cpu(from3->hdr.__count); + to->level = be16_to_cpu(from3->hdr.__level); + to->btree = from3->__btree; + ASSERT(to->magic == XFS_DA3_NODE_MAGIC); + } else { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); + to->btree = from->__btree; + ASSERT(to->magic == XFS_DA_NODE_MAGIC); + } +} + +void +xfs_da3_node_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + to3->hdr.info.hdr.forw = cpu_to_be32(from->forw); + to3->hdr.info.hdr.back = cpu_to_be32(from->back); + to3->hdr.info.hdr.magic = cpu_to_be16(from->magic); + to3->hdr.__count = cpu_to_be16(from->count); + to3->hdr.__level = cpu_to_be16(from->level); + } else { + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); + } } /* @@ -145,12 +204,9 @@ xfs_da3_node_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; - const struct xfs_dir_ops *ops; xfs_failaddr_t fa; - ops = xfs_dir_get_ops(mp, NULL); - - ops->node_hdr_from_disk(&ichdr, hdr); + xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr); fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); if 
(fa) @@ -275,46 +331,76 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = { .verify_struct = xfs_da3_node_verify_struct, }; +static int +xfs_da3_node_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + return 0; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF); + return 0; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + return 0; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp, + info, sizeof(*info)); + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; + } +} + int xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, + struct xfs_buf **bpp, + int whichfork) +{ + int error; + + error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork, + &xfs_da3_node_buf_ops); + if (error || !*bpp || !tp) + return error; + return xfs_da3_node_set_type(tp, *bpp); +} + +int +xfs_da3_node_read_mapped( + struct xfs_trans *tp, + struct xfs_inode *dp, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int which_fork) + int whichfork) { - int err; + struct xfs_mount *mp = dp->i_mount; + int error; - err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da3_node_buf_ops); - if (!err && tp && *bpp) { - struct xfs_da_blkinfo *info = (*bpp)->b_addr; - int type; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno, + XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0, + bpp, &xfs_da3_node_buf_ops); + if (error || !*bpp) + return error; - switch (be16_to_cpu(info->magic)) { - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - type = XFS_BLFT_DA_NODE_BUF; - break; - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - type = XFS_BLFT_ATTR_LEAF_BUF; - break; - case 
XFS_DIR2_LEAFN_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - type = XFS_BLFT_DIR_LEAFN_BUF; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - tp->t_mountp, info, sizeof(*info)); - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - return -EFSCORRUPTED; - } - xfs_trans_buf_set_type(tp, *bpp, type); - } - return err; + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF); + + if (!tp) + return 0; + return xfs_da3_node_set_type(tp, *bpp); } /*======================================================================== @@ -343,7 +429,7 @@ xfs_da3_node_create( trace_xfs_da_node_create(args); ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); + error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork); if (error) return error; bp->b_ops = &xfs_da3_node_buf_ops; @@ -363,9 +449,9 @@ xfs_da3_node_create( } ichdr.level = level; - dp->d_ops->node_hdr_to_disk(node, &ichdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size)); *bpp = bp; return 0; @@ -504,6 +590,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -516,6 +603,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -568,7 +656,7 @@ xfs_da3_root_split( dp = args->dp; tp = args->trans; - error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork); if (error) return error; node = bp->b_addr; @@ -577,8 +665,8 @@ xfs_da3_root_split( oldroot->hdr.info.magic == 
cpu_to_be16(XFS_DA3_NODE_MAGIC)) { struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); - btree = dp->d_ops->node_tree_p(oldroot); + xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot); + btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; @@ -589,15 +677,14 @@ xfs_da3_root_split( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; leaf = (xfs_dir2_leaf_t *)oldroot; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + size = (int)((char *)&leafhdr.ents[leafhdr.count] - + (char *)leaf); level = 0; /* @@ -637,14 +724,14 @@ xfs_da3_root_split( return error; node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; btree[0].hashval = cpu_to_be32(blk1->hashval); btree[0].before = cpu_to_be32(blk1->blkno); btree[1].hashval = cpu_to_be32(blk2->hashval); btree[1].before = cpu_to_be32(blk2->blkno); nodehdr.count = 2; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); #ifdef DEBUG if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || @@ -686,7 +773,7 @@ xfs_da3_node_split( trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -733,7 +820,7 @@ xfs_da3_node_split( * If we had double-split op below us, then add the extra block too. 
*/ node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); if (oldblk->index <= nodehdr.count) { oldblk->index++; xfs_da3_node_add(state, oldblk, addblk); @@ -788,10 +875,10 @@ xfs_da3_node_rebalance( node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; /* * Figure out how many entries need to move, and in which direction. @@ -804,10 +891,10 @@ xfs_da3_node_rebalance( tmpnode = node1; node1 = node2; node2 = tmpnode; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; swap = 1; } @@ -869,14 +956,15 @@ xfs_da3_node_rebalance( /* * Log header of node 1 and all current bits of node 2. 
*/ - dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); + xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node1, &node1->hdr, + state->args->geo->node_hdr_size)); - dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); + xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - dp->d_ops->node_hdr_size + + state->args->geo->node_hdr_size + (sizeof(btree2[0]) * nodehdr2.count))); /* @@ -886,10 +974,10 @@ xfs_da3_node_rebalance( if (swap) { node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); - dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2); + btree1 = nodehdr1.btree; + btree2 = nodehdr2.btree; } blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); @@ -921,8 +1009,8 @@ xfs_da3_node_add( trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); @@ -945,9 +1033,10 @@ xfs_da3_node_add( tmp + sizeof(*btree))); nodehdr.count += 1; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, + state->args->geo->node_hdr_size)); /* * Copy the last hash value from the oldblk to propagate upwards. 
@@ -1082,7 +1171,6 @@ xfs_da3_root_join( xfs_dablk_t child; struct xfs_buf *bp; struct xfs_da3_icnode_hdr oldroothdr; - struct xfs_da_node_entry *btree; int error; struct xfs_inode *dp = state->args->dp; @@ -1092,7 +1180,7 @@ xfs_da3_root_join( args = state->args; oldroot = root_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot); ASSERT(oldroothdr.forw == 0); ASSERT(oldroothdr.back == 0); @@ -1106,11 +1194,9 @@ xfs_da3_root_join( * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - btree = dp->d_ops->node_tree_p(oldroot); - child = be32_to_cpu(btree[0].before); + child = be32_to_cpu(oldroothdr.btree[0].before); ASSERT(child != 0); - error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, - args->whichfork); + error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); @@ -1172,7 +1258,7 @@ xfs_da3_node_toosmall( blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; node = (xfs_da_intnode_t *)info; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return 0; /* blk over 50%, don't try to join */ @@ -1224,13 +1310,13 @@ xfs_da3_node_toosmall( blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da3_node_read(state->args->trans, dp, - blkno, -1, &bp, state->args->whichfork); + error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp, + state->args->whichfork); if (error) return error; node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&thdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); xfs_trans_brelse(state->args->trans, bp); if (count - thdr.count >= 0) @@ -1272,18 +1358,14 @@ 
xfs_da3_node_lasthash( struct xfs_buf *bp, int *count) { - struct xfs_da_intnode *node; - struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; - node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); if (count) *count = nodehdr.count; if (!nodehdr.count) return 0; - btree = dp->d_ops->node_tree_p(node); - return be32_to_cpu(btree[nodehdr.count - 1].hashval); + return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval); } /* @@ -1328,8 +1410,8 @@ xfs_da3_fixhashpath( struct xfs_da3_icnode_hdr nodehdr; node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; @@ -1360,7 +1442,7 @@ xfs_da3_node_remove( trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); @@ -1368,7 +1450,7 @@ xfs_da3_node_remove( * Copy over the offending entry, or just zero it out. 
*/ index = drop_blk->index; - btree = dp->d_ops->node_tree_p(node); + btree = nodehdr.btree; if (index < nodehdr.count - 1) { tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); @@ -1381,9 +1463,9 @@ xfs_da3_node_remove( xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); nodehdr.count -= 1; - dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. @@ -1416,10 +1498,10 @@ xfs_da3_node_unbalance( drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); - dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); - drop_btree = dp->d_ops->node_tree_p(drop_node); - save_btree = dp->d_ops->node_tree_p(save_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node); + drop_btree = drop_hdr.btree; + save_btree = save_hdr.btree; tp = state->args->trans; /* @@ -1453,10 +1535,10 @@ xfs_da3_node_unbalance( memcpy(&save_btree[sindex], &drop_btree[0], tmp); save_hdr.count += drop_hdr.count; - dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); + xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - dp->d_ops->node_hdr_size)); + state->args->geo->node_hdr_size)); /* * Save the last hashval in the remaining block for upward propagation. 
@@ -1517,7 +1599,7 @@ xfs_da3_node_lookup_int( */ blk->blkno = blkno; error = xfs_da3_node_read(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork); + &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; @@ -1541,8 +1623,10 @@ xfs_da3_node_lookup_int( break; } - if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) + if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; @@ -1550,19 +1634,22 @@ xfs_da3_node_lookup_int( * Search an intermediate node for a match. */ node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); + btree = nodehdr.btree; /* Tree taller than we can handle; bail out! */ - if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; + } /* Check the level from the root. */ if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; - else if (expected_level != nodehdr.level) + else if (expected_level != nodehdr.level) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; - else + } else expected_level--; max = nodehdr.count; @@ -1612,11 +1699,11 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. 
*/ - if (blkno == args->geo->leafblk) + if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) return -EFSCORRUPTED; } - if (expected_level != 0) + if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) return -EFSCORRUPTED; /* @@ -1678,10 +1765,10 @@ xfs_da3_node_order( node1 = node1_bp->b_addr; node2 = node2_bp->b_addr; - dp->d_ops->node_hdr_from_disk(&node1hdr, node1); - dp->d_ops->node_hdr_from_disk(&node2hdr, node2); - btree1 = dp->d_ops->node_tree_p(node1); - btree2 = dp->d_ops->node_tree_p(node2); + xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1); + xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2); + btree1 = node1hdr.btree; + btree2 = node2hdr.btree; if (node1hdr.count > 0 && node2hdr.count > 0 && ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || @@ -1746,7 +1833,7 @@ xfs_da3_blk_link( if (old_info->back) { error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1767,7 +1854,7 @@ xfs_da3_blk_link( if (old_info->forw) { error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1826,7 +1913,7 @@ xfs_da3_blk_unlink( if (drop_info->back) { error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1843,7 +1930,7 @@ xfs_da3_blk_unlink( if (drop_info->forw) { error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork); + &bp, args->whichfork); if (error) return error; ASSERT(bp != NULL); @@ -1878,7 +1965,6 @@ xfs_da3_path_shift( { struct xfs_da_state_blk *blk; struct xfs_da_blkinfo *info; - struct xfs_da_intnode *node; struct xfs_da_args *args; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; @@ 
-1900,18 +1986,18 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { - node = blk->bp->b_addr; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + for (; level >= 0; level--) { + blk = &path->blk[level]; + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, + blk->bp->b_addr); if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(btree[blk->index].before); + blkno = be32_to_cpu(nodehdr.btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(btree[blk->index].before); + blkno = be32_to_cpu(nodehdr.btree[blk->index].before); break; } } @@ -1929,7 +2015,7 @@ xfs_da3_path_shift( /* * Read the next child block into a local buffer. */ - error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp, + error = xfs_da3_node_read(args->trans, dp, blkno, &bp, args->whichfork); if (error) return error; @@ -1962,9 +2048,9 @@ xfs_da3_path_shift( case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: blk->magic = XFS_DA_NODE_MAGIC; - node = (xfs_da_intnode_t *)info; - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, + bp->b_addr); + btree = nodehdr.btree; blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; @@ -2044,18 +2130,6 @@ xfs_da_compname( XFS_CMP_EXACT : XFS_CMP_DIFFERENT; } -static xfs_dahash_t -xfs_default_hashname( - struct xfs_name *name) -{ - return xfs_da_hashname(name->name, name->len); -} - -const struct xfs_nameops xfs_default_nameops = { - .hashname = xfs_default_hashname, - .compname = xfs_da_compname -}; - int xfs_da_grow_inode_int( struct xfs_da_args *args, @@ -2213,16 +2287,13 @@ xfs_da3_swap_lastblock( error = 
xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; - if (unlikely(lastoff == 0)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, - mp); + if (XFS_IS_CORRUPT(mp, lastoff == 0)) return -EFSCORRUPTED; - } /* * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; - error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); + error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; /* @@ -2240,16 +2311,17 @@ xfs_da3_swap_lastblock( struct xfs_dir2_leaf_entry *ents; dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); - ents = dp->d_ops->leaf_ents_p(dead_leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, + dead_leaf2); + ents = leafhdr.ents; dead_level = 0; dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { struct xfs_da3_icnode_hdr deadhdr; dead_node = (xfs_da_intnode_t *)dead_info; - dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); - btree = dp->d_ops->node_tree_p(dead_node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node); + btree = deadhdr.btree; dead_level = deadhdr.level; dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } @@ -2258,15 +2330,13 @@ xfs_da3_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. 
*/ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; - if (unlikely( - be32_to_cpu(sib_info->forw) != last_blkno || - sib_info->magic != dead_info->magic)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, + be32_to_cpu(sib_info->forw) != last_blkno || + sib_info->magic != dead_info->magic)) { error = -EFSCORRUPTED; goto done; } @@ -2280,15 +2350,13 @@ xfs_da3_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; - if (unlikely( - be32_to_cpu(sib_info->back) != last_blkno || - sib_info->magic != dead_info->magic)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, + be32_to_cpu(sib_info->back) != last_blkno || + sib_info->magic != dead_info->magic)) { error = -EFSCORRUPTED; goto done; } @@ -2304,27 +2372,24 @@ xfs_da3_swap_lastblock( * Walk down the tree looking for the parent of the moved block. 
*/ for (;;) { - error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); - if (level >= 0 && level != par_hdr.level + 1) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", - XFS_ERRLEVEL_LOW, mp); + xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); + if (XFS_IS_CORRUPT(mp, + level >= 0 && level != par_hdr.level + 1)) { error = -EFSCORRUPTED; goto done; } level = par_hdr.level; - btree = dp->d_ops->node_tree_p(par_node); + btree = par_hdr.btree; for (entno = 0; entno < par_hdr.count && be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (entno == par_hdr.count) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) { error = -EFSCORRUPTED; goto done; } @@ -2349,24 +2414,20 @@ xfs_da3_swap_lastblock( par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; - if (unlikely(par_blkno == 0)) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", - XFS_ERRLEVEL_LOW, mp); + if (XFS_IS_CORRUPT(mp, par_blkno == 0)) { error = -EFSCORRUPTED; goto done; } - error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); - if (par_hdr.level != level) { - XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", - XFS_ERRLEVEL_LOW, mp); + xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); + if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { error = -EFSCORRUPTED; goto done; } - btree = dp->d_ops->node_tree_p(par_node); + btree = par_hdr.btree; entno = 0; } /* @@ -2429,159 +2490,86 @@ xfs_da_shrink_inode( return error; } -/* - * See if the mapping(s) for this btree block are valid, i.e. 
- * don't contain holes, are logically contiguous, and cover the whole range. - */ -STATIC int -xfs_da_map_covers_blocks( - int nmap, - xfs_bmbt_irec_t *mapp, - xfs_dablk_t bno, - int count) -{ - int i; - xfs_fileoff_t off; - - for (i = 0, off = bno; i < nmap; i++) { - if (mapp[i].br_startblock == HOLESTARTBLOCK || - mapp[i].br_startblock == DELAYSTARTBLOCK) { - return 0; - } - if (off != mapp[i].br_startoff) { - return 0; - } - off += mapp[i].br_blockcount; - } - return off == bno + count; -} - -/* - * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. - * - * For the single map case, it is assumed that the caller has provided a pointer - * to a valid xfs_buf_map. For the multiple map case, this function will - * allocate the xfs_buf_map to hold all the maps and replace the caller's single - * map pointer with the allocated map. - */ static int -xfs_buf_map_from_irec( - struct xfs_mount *mp, +xfs_dabuf_map( + struct xfs_inode *dp, + xfs_dablk_t bno, + unsigned int flags, + int whichfork, struct xfs_buf_map **mapp, - int *nmaps, - struct xfs_bmbt_irec *irecs, - int nirecs) + int *nmaps) { - struct xfs_buf_map *map; - int i; - - ASSERT(*nmaps == 1); - ASSERT(nirecs >= 1); + struct xfs_mount *mp = dp->i_mount; + int nfsb = xfs_dabuf_nfsb(mp, whichfork); + struct xfs_bmbt_irec irec, *irecs = &irec; + struct xfs_buf_map *map = *mapp; + xfs_fileoff_t off = bno; + int error = 0, nirecs, i; + + if (nfsb > 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS); + + nirecs = nfsb; + error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, + xfs_bmapi_aflag(whichfork)); + if (error) + goto out_free_irecs; + /* + * Use the caller provided map for the single map case, else allocate a + * larger one that needs to be free by the caller. 
+ */ if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), - KM_NOFS); - if (!map) - return -ENOMEM; + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); + if (!map) { + error = -ENOMEM; + goto out_free_irecs; + } *mapp = map; } - *nmaps = nirecs; - map = *mapp; - for (i = 0; i < *nmaps; i++) { - ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && - irecs[i].br_startblock != HOLESTARTBLOCK); + for (i = 0; i < nirecs; i++) { + if (irecs[i].br_startblock == HOLESTARTBLOCK || + irecs[i].br_startblock == DELAYSTARTBLOCK) + goto invalid_mapping; + if (off != irecs[i].br_startoff) + goto invalid_mapping; + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + off += irecs[i].br_blockcount; } - return 0; -} -/* - * Map the block we are given ready for reading. There are three possible return - * values: - * -1 - will be returned if we land in a hole and mappedbno == -2 so the - * caller knows not to execute a subsequent read. - * 0 - if we mapped the block successfully - * >0 - positive error number if there was an error. - */ -static int -xfs_dabuf_map( - struct xfs_inode *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - int whichfork, - struct xfs_buf_map **map, - int *nmaps) -{ - struct xfs_mount *mp = dp->i_mount; - int nfsb; - int error = 0; - struct xfs_bmbt_irec irec; - struct xfs_bmbt_irec *irecs = &irec; - int nirecs; + if (off != bno + nfsb) + goto invalid_mapping; - ASSERT(map && *map); - ASSERT(*nmaps == 1); - - if (whichfork == XFS_DATA_FORK) - nfsb = mp->m_dir_geo->fsbcount; - else - nfsb = mp->m_attr_geo->fsbcount; - - /* - * Caller doesn't have a mapping. -2 means don't complain - * if we land in a hole. - */ - if (mappedbno == -1 || mappedbno == -2) { - /* - * Optimize the one-block case. 
- */ - if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, - KM_NOFS); + *nmaps = nirecs; +out_free_irecs: + if (irecs != &irec) + kmem_free(irecs); + return error; - nirecs = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, - &nirecs, xfs_bmapi_aflag(whichfork)); - if (error) - goto out; - } else { - irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - irecs->br_startoff = (xfs_fileoff_t)bno; - irecs->br_blockcount = nfsb; - irecs->br_state = 0; - nirecs = 1; - } +invalid_mapping: + /* Caller ok with no mapping. */ + if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) { + error = -EFSCORRUPTED; + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "%s: bno %u inode %llu", + __func__, bno, dp->i_ino); - if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { - error = mappedbno == -2 ? -1 : -EFSCORRUPTED; - if (unlikely(error == -EFSCORRUPTED)) { - if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - int i; - xfs_alert(mp, "%s: bno %lld dir: inode %lld", - __func__, (long long)bno, - (long long)dp->i_ino); - for (i = 0; i < *nmaps; i++) { - xfs_alert(mp, + for (i = 0; i < nirecs; i++) { + xfs_alert(mp, "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", - i, - (long long)irecs[i].br_startoff, - (long long)irecs[i].br_startblock, - (long long)irecs[i].br_blockcount, - irecs[i].br_state); - } + i, irecs[i].br_startoff, + irecs[i].br_startblock, + irecs[i].br_blockcount, + irecs[i].br_state); } - XFS_ERROR_REPORT("xfs_da_do_buf(1)", - XFS_ERRLEVEL_LOW, mp); } - goto out; + } else { + *nmaps = 0; } - error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); -out: - if (irecs != &irec) - kmem_free(irecs); - return error; + goto out_free_irecs; } /* @@ -2589,39 +2577,26 @@ out: */ int xfs_da_get_buf( - struct xfs_trans *trans, + struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork) { + struct xfs_mount *mp = dp->i_mount; struct xfs_buf 
*bp; - struct xfs_buf_map map; - struct xfs_buf_map *mapp; - int nmap; + struct xfs_buf_map map, *mapp = &map; + int nmap = 1; int error; *bpp = NULL; - mapp = &map; - nmap = 1; - error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap); + if (error || nmap == 0) goto out_free; - } - bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0); - error = bp ? bp->b_error : -EIO; - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); + error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp); + if (error) goto out_free; - } *bpp = bp; @@ -2637,35 +2612,27 @@ out_free: */ int xfs_da_read_buf( - struct xfs_trans *trans, + struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, + unsigned int flags, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops) { + struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; - struct xfs_buf_map map; - struct xfs_buf_map *mapp; - int nmap; + struct xfs_buf_map map, *mapp = &map; + int nmap = 1; int error; *bpp = NULL; - mapp = &map; - nmap = 1; - error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap); + if (error || !nmap) goto out_free; - } - error = xfs_trans_read_buf_map(dp->i_mount, trans, - dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, ops); + error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0, + &bp, ops); if (error) goto out_free; @@ -2688,7 +2655,7 @@ int xfs_da_reada_buf( struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mappedbno, + unsigned int flags, int whichfork, const struct xfs_buf_ops *ops) { @@ -2699,16 +2666,10 @@ xfs_da_reada_buf( mapp = &map; nmap = 1; - error = 
xfs_dabuf_map(dp, bno, mappedbno, whichfork, - &mapp, &nmap); - if (error) { - /* mapping a hole is not an error, but we don't continue */ - if (error == -1) - error = 0; + error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap); + if (error || !nmap) goto out_free; - } - mappedbno = mapp[0].bm_bn; xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ae0bbd20d9ca..53e503b6f186 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -10,7 +10,6 @@ struct xfs_inode; struct xfs_trans; struct zone; -struct xfs_dir_ops; /* * Directory/attribute geometry information. There will be one of these for each @@ -18,15 +17,23 @@ struct xfs_dir_ops; * structures will be attached to the xfs_mount. */ struct xfs_da_geometry { - int blksize; /* da block size in bytes */ - int fsbcount; /* da block size in filesystem blocks */ + unsigned int blksize; /* da block size in bytes */ + unsigned int fsbcount; /* da block size in filesystem blocks */ uint8_t fsblog; /* log2 of _filesystem_ block size */ uint8_t blklog; /* log2 of da block size */ - uint node_ents; /* # of entries in a danode */ - int magicpct; /* 37% of block size in bytes */ + unsigned int node_hdr_size; /* danode header size in bytes */ + unsigned int node_ents; /* # of entries in a danode */ + unsigned int magicpct; /* 37% of block size in bytes */ xfs_dablk_t datablk; /* blockno of dir data v2 */ + unsigned int leaf_hdr_size; /* dir2 leaf header size */ + unsigned int leaf_max_ents; /* # of entries in dir2 leaf */ xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + unsigned int free_hdr_size; /* dir2 free header size */ + unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; }; /*======================================================================== 
@@ -50,9 +57,10 @@ typedef struct xfs_da_args { const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ - uint8_t *value; /* set of bytes (maybe contain NULLs) */ + void *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ - int flags; /* argument flags (eg: ATTR_NOCREATE) */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ xfs_dahash_t hashval; /* hash value of name */ xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ @@ -81,7 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -89,7 +97,7 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_NOTIME, "NOTIME" } /* * Storage for holding state during Btree searches and split/join ops. @@ -125,6 +133,25 @@ typedef struct xfs_da_state { } xfs_da_state_t; /* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. 
+ */ + struct xfs_da_node_entry *btree; +}; + +/* * Utility macros to aid in logging changed structure fields. */ #define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) @@ -132,16 +159,6 @@ typedef struct xfs_da_state { (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) -/* - * Name ops for directory and/or attr name operations - */ -struct xfs_nameops { - xfs_dahash_t (*hashname)(struct xfs_name *); - enum xfs_dacmp (*compname)(struct xfs_da_args *, - const unsigned char *, int); -}; - - /*======================================================================== * Function prototypes. *========================================================================*/ @@ -172,25 +189,28 @@ int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int which_fork); + xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork); +int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_daddr_t mappedbno, struct xfs_buf **bpp, + int whichfork); /* * Utility routines. 
*/ + +#define XFS_DABUF_MAP_HOLE_OK (1 << 0) + int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bp, int whichfork); + xfs_dablk_t bno, struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork, - const struct xfs_buf_ops *ops); + xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp, + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno, int whichfork, - const struct xfs_buf_ops *ops); + unsigned int flags, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); @@ -202,7 +222,11 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, + struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); +void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, + struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); + extern struct kmem_zone *xfs_da_state_zone; -extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c deleted file mode 100644 index b1ae572496b6..000000000000 --- a/fs/xfs/libxfs/xfs_da_format.c +++ /dev/null @@ -1,888 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * Copyright (c) 2013 Red Hat, Inc. - * All Rights Reserved. 
- */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_inode.h" -#include "xfs_dir2.h" - -/* - * Shortform directory ops - */ -static int -xfs_dir2_sf_entsize( - struct xfs_dir2_sf_hdr *hdr, - int len) -{ - int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ - - count += len; /* name */ - count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ - return count; -} - -static int -xfs_dir3_sf_entsize( - struct xfs_dir2_sf_hdr *hdr, - int len) -{ - return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t); -} - -static struct xfs_dir2_sf_entry * -xfs_dir2_sf_nextentry( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); -} - -static struct xfs_dir2_sf_entry * -xfs_dir3_sf_nextentry( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); -} - - -/* - * For filetype enabled shortform directories, the file type field is stored at - * the end of the name. Because it's only a single byte, endian conversion is - * not necessary. For non-filetype enable directories, the type is always - * unknown and we never store the value. 
- */ -static uint8_t -xfs_dir2_sfe_get_ftype( - struct xfs_dir2_sf_entry *sfep) -{ - return XFS_DIR3_FT_UNKNOWN; -} - -static void -xfs_dir2_sfe_put_ftype( - struct xfs_dir2_sf_entry *sfep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); -} - -static uint8_t -xfs_dir3_sfe_get_ftype( - struct xfs_dir2_sf_entry *sfep) -{ - uint8_t ftype; - - ftype = sfep->name[sfep->namelen]; - if (ftype >= XFS_DIR3_FT_MAX) - return XFS_DIR3_FT_UNKNOWN; - return ftype; -} - -static void -xfs_dir3_sfe_put_ftype( - struct xfs_dir2_sf_entry *sfep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); - - sfep->name[sfep->namelen] = ftype; -} - -/* - * Inode numbers in short-form directories can come in two versions, - * either 4 bytes or 8 bytes wide. These helpers deal with the - * two forms transparently by looking at the headers i8count field. - * - * For 64-bit inode number the most significant byte must be zero. - */ -static xfs_ino_t -xfs_dir2_sf_get_ino( - struct xfs_dir2_sf_hdr *hdr, - uint8_t *from) -{ - if (hdr->i8count) - return get_unaligned_be64(from) & 0x00ffffffffffffffULL; - else - return get_unaligned_be32(from); -} - -static void -xfs_dir2_sf_put_ino( - struct xfs_dir2_sf_hdr *hdr, - uint8_t *to, - xfs_ino_t ino) -{ - ASSERT((ino & 0xff00000000000000ULL) == 0); - - if (hdr->i8count) - put_unaligned_be64(ino, to); - else - put_unaligned_be32(ino, to); -} - -static xfs_ino_t -xfs_dir2_sf_get_parent_ino( - struct xfs_dir2_sf_hdr *hdr) -{ - return xfs_dir2_sf_get_ino(hdr, hdr->parent); -} - -static void -xfs_dir2_sf_put_parent_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); -} - -/* - * In short-form directory entries the inode numbers are stored at variable - * offset behind the entry name. If the entry stores a filetype value, then it - * sits between the name and the inode number. Hence the inode numbers may only - * be accessed through the helpers below. 
- */ -static xfs_ino_t -xfs_dir2_sfe_get_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); -} - -static void -xfs_dir2_sfe_put_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); -} - -static xfs_ino_t -xfs_dir3_sfe_get_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); -} - -static void -xfs_dir3_sfe_put_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); -} - - -/* - * Directory data block operations - */ - -/* - * For special situations, the dirent size ends up fixed because we always know - * what the size of the entry is. That's true for the "." and "..", and - * therefore we know that they are a fixed size and hence their offsets are - * constant, as is the first entry. - * - * Hence, this calculation is written as a macro to be able to be calculated at - * compile time and so certain offsets can be calculated directly in the - * structure initaliser via the macro. There are two macros - one for dirents - * with ftype and without so there are no unresolvable conditionals in the - * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power - * of 2 and the compiler doesn't reject it (unlike roundup()). 
- */ -#define XFS_DIR2_DATA_ENTSIZE(n) \ - round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) - -#define XFS_DIR3_DATA_ENTSIZE(n) \ - round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ - sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \ - XFS_DIR2_DATA_ALIGN) - -static int -xfs_dir2_data_entsize( - int n) -{ - return XFS_DIR2_DATA_ENTSIZE(n); -} - -static int -xfs_dir3_data_entsize( - int n) -{ - return XFS_DIR3_DATA_ENTSIZE(n); -} - -static uint8_t -xfs_dir2_data_get_ftype( - struct xfs_dir2_data_entry *dep) -{ - return XFS_DIR3_FT_UNKNOWN; -} - -static void -xfs_dir2_data_put_ftype( - struct xfs_dir2_data_entry *dep, - uint8_t ftype) -{ - ASSERT(ftype < XFS_DIR3_FT_MAX); -} - -static uint8_t -xfs_dir3_data_get_ftype( - struct xfs_dir2_data_entry *dep) -{ - uint8_t ftype = dep->name[dep->namelen]; - - if (ftype >= XFS_DIR3_FT_MAX) - return XFS_DIR3_FT_UNKNOWN; - return ftype; -} - -static void -xfs_dir3_data_put_ftype( - struct xfs_dir2_data_entry *dep, - uint8_t type) -{ - ASSERT(type < XFS_DIR3_FT_MAX); - ASSERT(dep->namelen != 0); - - dep->name[dep->namelen] = type; -} - -/* - * Pointer to an entry's tag word. - */ -static __be16 * -xfs_dir2_data_entry_tag_p( - struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); -} - -static __be16 * -xfs_dir3_data_entry_tag_p( - struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); -} - -/* - * location of . and .. 
in data space (always block 0) - */ -static struct xfs_dir2_data_entry * -xfs_dir2_data_dot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1) + - XFS_DIR2_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_ftype_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir2_ftype_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_dot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_dotdot_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_first_entry_p( - struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2)); -} - -static struct xfs_dir2_data_free * -xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr 
*hdr) -{ - return hdr->bestfree; -} - -static struct xfs_dir2_data_free * -xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir3_data_hdr *)hdr)->best_free; -} - -static struct xfs_dir2_data_entry * -xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_unused * -xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_unused *) - ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); -} - -static struct xfs_dir2_data_entry * -xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_entry *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - -static struct xfs_dir2_data_unused * -xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) -{ - return (struct xfs_dir2_data_unused *) - ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); -} - - -/* - * Directory Leaf block operations - */ -static int -xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -static struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) -{ - return lp->__ents; -} - -static int -xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -static struct xfs_dir2_leaf_entry * -xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) -{ - return ((struct xfs_dir3_leaf *)lp)->__ents; -} - -static void -xfs_dir2_leaf_hdr_from_disk( - struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from) -{ - to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.count); - to->stale = be16_to_cpu(from->hdr.stale); - - ASSERT(to->magic == 
XFS_DIR2_LEAF1_MAGIC || - to->magic == XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir2_leaf_hdr_to_disk( - struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || - from->magic == XFS_DIR2_LEAFN_MAGIC); - - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.count = cpu_to_be16(from->count); - to->hdr.stale = cpu_to_be16(from->stale); -} - -static void -xfs_dir3_leaf_hdr_from_disk( - struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from) -{ - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; - - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->count); - to->stale = be16_to_cpu(hdr3->stale); - - ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || - to->magic == XFS_DIR3_LEAFN_MAGIC); -} - -static void -xfs_dir3_leaf_hdr_to_disk( - struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from) -{ - struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; - - ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || - from->magic == XFS_DIR3_LEAFN_MAGIC); - - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->count = cpu_to_be16(from->count); - hdr3->stale = cpu_to_be16(from->stale); -} - - -/* - * Directory/Attribute Node block operations - */ -static struct xfs_da_node_entry * -xfs_da2_node_tree_p(struct xfs_da_intnode *dap) -{ - return dap->__btree; -} - -static struct xfs_da_node_entry * -xfs_da3_node_tree_p(struct xfs_da_intnode *dap) -{ - return ((struct xfs_da3_intnode *)dap)->__btree; -} - -static void -xfs_da2_node_hdr_from_disk( - struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from) -{ - ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - 
to->forw = be32_to_cpu(from->hdr.info.forw); - to->back = be32_to_cpu(from->hdr.info.back); - to->magic = be16_to_cpu(from->hdr.info.magic); - to->count = be16_to_cpu(from->hdr.__count); - to->level = be16_to_cpu(from->hdr.__level); -} - -static void -xfs_da2_node_hdr_to_disk( - struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from) -{ - ASSERT(from->magic == XFS_DA_NODE_MAGIC); - to->hdr.info.forw = cpu_to_be32(from->forw); - to->hdr.info.back = cpu_to_be32(from->back); - to->hdr.info.magic = cpu_to_be16(from->magic); - to->hdr.__count = cpu_to_be16(from->count); - to->hdr.__level = cpu_to_be16(from->level); -} - -static void -xfs_da3_node_hdr_from_disk( - struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from) -{ - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; - - ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); - to->forw = be32_to_cpu(hdr3->info.hdr.forw); - to->back = be32_to_cpu(hdr3->info.hdr.back); - to->magic = be16_to_cpu(hdr3->info.hdr.magic); - to->count = be16_to_cpu(hdr3->__count); - to->level = be16_to_cpu(hdr3->__level); -} - -static void -xfs_da3_node_hdr_to_disk( - struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from) -{ - struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; - - ASSERT(from->magic == XFS_DA3_NODE_MAGIC); - hdr3->info.hdr.forw = cpu_to_be32(from->forw); - hdr3->info.hdr.back = cpu_to_be32(from->back); - hdr3->info.hdr.magic = cpu_to_be16(from->magic); - hdr3->__count = cpu_to_be16(from->count); - hdr3->__level = cpu_to_be16(from->level); -} - - -/* - * Directory free space block operations - */ -static int -xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / - sizeof(xfs_dir2_data_off_t); -} - -static __be16 * -xfs_dir2_free_bests_p(struct xfs_dir2_free *free) -{ - return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); -} - -/* - * Convert data space db to the corresponding free db. 
- */ -static xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + - (db / xfs_dir2_free_max_bests(geo)); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -static int -xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return db % xfs_dir2_free_max_bests(geo); -} - -static int -xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) -{ - return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / - sizeof(xfs_dir2_data_off_t); -} - -static __be16 * -xfs_dir3_free_bests_p(struct xfs_dir2_free *free) -{ - return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); -} - -/* - * Convert data space db to the corresponding free db. - */ -static xfs_dir2_db_t -xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + - (db / xfs_dir3_free_max_bests(geo)); -} - -/* - * Convert data space db to the corresponding index in a free db. 
- */ -static int -xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return db % xfs_dir3_free_max_bests(geo); -} - -static void -xfs_dir2_free_hdr_from_disk( - struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from) -{ - to->magic = be32_to_cpu(from->hdr.magic); - to->firstdb = be32_to_cpu(from->hdr.firstdb); - to->nvalid = be32_to_cpu(from->hdr.nvalid); - to->nused = be32_to_cpu(from->hdr.nused); - ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); -} - -static void -xfs_dir2_free_hdr_to_disk( - struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from) -{ - ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); - - to->hdr.magic = cpu_to_be32(from->magic); - to->hdr.firstdb = cpu_to_be32(from->firstdb); - to->hdr.nvalid = cpu_to_be32(from->nvalid); - to->hdr.nused = cpu_to_be32(from->nused); -} - -static void -xfs_dir3_free_hdr_from_disk( - struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from) -{ - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; - - to->magic = be32_to_cpu(hdr3->hdr.magic); - to->firstdb = be32_to_cpu(hdr3->firstdb); - to->nvalid = be32_to_cpu(hdr3->nvalid); - to->nused = be32_to_cpu(hdr3->nused); - - ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); -} - -static void -xfs_dir3_free_hdr_to_disk( - struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from) -{ - struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; - - ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); - - hdr3->hdr.magic = cpu_to_be32(from->magic); - hdr3->firstdb = cpu_to_be32(from->firstdb); - hdr3->nvalid = cpu_to_be32(from->nvalid); - hdr3->nused = cpu_to_be32(from->nused); -} - -static const struct xfs_dir_ops xfs_dir2_ops = { - .sf_entsize = xfs_dir2_sf_entsize, - .sf_nextentry = xfs_dir2_sf_nextentry, - .sf_get_ftype = xfs_dir2_sfe_get_ftype, - .sf_put_ftype = xfs_dir2_sfe_put_ftype, - .sf_get_ino = xfs_dir2_sfe_get_ino, - .sf_put_ino = xfs_dir2_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = 
xfs_dir2_sf_put_parent_ino, - - .data_entsize = xfs_dir2_data_entsize, - .data_get_ftype = xfs_dir2_data_get_ftype, - .data_put_ftype = xfs_dir2_data_put_ftype, - .data_entry_tag_p = xfs_dir2_data_entry_tag_p, - .data_bestfree_p = xfs_dir2_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR2_DATA_ENTSIZE(1) + - XFS_DIR2_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), - - .data_dot_entry_p = xfs_dir2_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir2_data_first_entry_p, - .data_entry_p = xfs_dir2_data_entry_p, - .data_unused_p = xfs_dir2_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir2_max_leaf_ents, - .leaf_ents_p = xfs_dir2_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), - .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, - .free_max_bests = xfs_dir2_free_max_bests, - .free_bests_p = xfs_dir2_free_bests_p, - .db_to_fdb = xfs_dir2_db_to_fdb, - .db_to_fdindex = xfs_dir2_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir2_ftype_ops = { - .sf_entsize = xfs_dir3_sf_entsize, - .sf_nextentry = xfs_dir3_sf_nextentry, - .sf_get_ftype = xfs_dir3_sfe_get_ftype, - .sf_put_ftype = xfs_dir3_sfe_put_ftype, - .sf_get_ino = xfs_dir3_sfe_get_ino, - .sf_put_ino = xfs_dir3_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, - - .data_entsize = 
xfs_dir3_data_entsize, - .data_get_ftype = xfs_dir3_data_get_ftype, - .data_put_ftype = xfs_dir3_data_put_ftype, - .data_entry_tag_p = xfs_dir3_data_entry_tag_p, - .data_bestfree_p = xfs_dir2_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), - - .data_dot_entry_p = xfs_dir2_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, - .data_entry_p = xfs_dir2_data_entry_p, - .data_unused_p = xfs_dir2_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir2_max_leaf_ents, - .leaf_ents_p = xfs_dir2_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), - .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, - .free_max_bests = xfs_dir2_free_max_bests, - .free_bests_p = xfs_dir2_free_bests_p, - .db_to_fdb = xfs_dir2_db_to_fdb, - .db_to_fdindex = xfs_dir2_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir3_ops = { - .sf_entsize = xfs_dir3_sf_entsize, - .sf_nextentry = xfs_dir3_sf_nextentry, - .sf_get_ftype = xfs_dir3_sfe_get_ftype, - .sf_put_ftype = xfs_dir3_sfe_put_ftype, - .sf_get_ino = xfs_dir3_sfe_get_ino, - .sf_put_ino = xfs_dir3_sfe_put_ino, - .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, - .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, - - .data_entsize = xfs_dir3_data_entsize, - .data_get_ftype = 
xfs_dir3_data_get_ftype, - .data_put_ftype = xfs_dir3_data_put_ftype, - .data_entry_tag_p = xfs_dir3_data_entry_tag_p, - .data_bestfree_p = xfs_dir3_data_bestfree_p, - - .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), - .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1), - .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + - XFS_DIR3_DATA_ENTSIZE(1) + - XFS_DIR3_DATA_ENTSIZE(2), - .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), - - .data_dot_entry_p = xfs_dir3_data_dot_entry_p, - .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, - .data_first_entry_p = xfs_dir3_data_first_entry_p, - .data_entry_p = xfs_dir3_data_entry_p, - .data_unused_p = xfs_dir3_data_unused_p, - - .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), - .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, - .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, - .leaf_max_ents = xfs_dir3_max_leaf_ents, - .leaf_ents_p = xfs_dir3_leaf_ents_p, - - .node_hdr_size = sizeof(struct xfs_da3_node_hdr), - .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, - .node_tree_p = xfs_da3_node_tree_p, - - .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), - .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, - .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, - .free_max_bests = xfs_dir3_free_max_bests, - .free_bests_p = xfs_dir3_free_bests_p, - .db_to_fdb = xfs_dir3_db_to_fdb, - .db_to_fdindex = xfs_dir3_db_to_fdindex, -}; - -static const struct xfs_dir_ops xfs_dir2_nondir_ops = { - .node_hdr_size = sizeof(struct xfs_da_node_hdr), - .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, - .node_tree_p = xfs_da2_node_tree_p, -}; - -static const struct xfs_dir_ops xfs_dir3_nondir_ops = { - .node_hdr_size = sizeof(struct xfs_da3_node_hdr), - .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, - .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, - .node_tree_p = xfs_da3_node_tree_p, -}; - -/* - * 
Return the ops structure according to the current config. If we are passed - * an inode, then that overrides the default config we use which is based on - * feature bits. - */ -const struct xfs_dir_ops * -xfs_dir_get_ops( - struct xfs_mount *mp, - struct xfs_inode *dp) -{ - if (dp) - return dp->d_ops; - if (mp->m_dir_inode_ops) - return mp->m_dir_inode_ops; - if (xfs_sb_version_hascrc(&mp->m_sb)) - return &xfs_dir3_ops; - if (xfs_sb_version_hasftype(&mp->m_sb)) - return &xfs_dir2_ftype_ops; - return &xfs_dir2_ops; -} - -const struct xfs_dir_ops * -xfs_nondir_get_ops( - struct xfs_mount *mp, - struct xfs_inode *dp) -{ - if (dp) - return dp->d_ops; - if (mp->m_nondir_inode_ops) - return mp->m_nondir_inode_ops; - if (xfs_sb_version_hascrc(&mp->m_sb)) - return &xfs_dir3_nondir_ops; - return &xfs_dir2_nondir_ops; -} diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index ae654e06b2fb..08c0a4d98b89 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -94,19 +94,6 @@ struct xfs_da3_intnode { }; /* - * In-core version of the node header to abstract the differences in the v2 and - * v3 disk format of the headers. Callers need to convert to/from disk format as - * appropriate. - */ -struct xfs_da3_icnode_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t level; -}; - -/* * Directory version 2. * * There are 4 possible formats: @@ -230,7 +217,7 @@ typedef struct xfs_dir2_sf_entry { * A 64-bit or 32-bit inode number follows here, at a variable offset * after the name. */ -} xfs_dir2_sf_entry_t; +} __packed xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { __be32 pad; /* 64 bit alignment */ }; -struct xfs_dir3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t stale; -}; - /* * Leaf block entry. 
*/ @@ -482,7 +461,7 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* - * Free space block defintions for the node format. + * Free space block definitions for the node format. */ /* @@ -521,19 +500,6 @@ struct xfs_dir3_free { #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) /* - * In core version of the free block header, abstracted away from on-disk format - * differences. Use this in the code, and convert to/from the disk version using - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. - */ -struct xfs_dir3_icfree_hdr { - uint32_t magic; - uint32_t firstdb; - uint32_t nvalid; - uint32_t nused; - -}; - -/* * Single block format. * * The single block format looks like the following drawing on disk: @@ -710,29 +676,6 @@ struct xfs_attr3_leafblock { }; /* - * incore, neutral version of the attribute leaf header - */ -struct xfs_attr3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t usedbytes; - /* - * firstused is 32-bit here instead of 16-bit like the on-disk variant - * to support maximum fsb size of 64k without overflow issues throughout - * the attr code. Instead, the overflow condition is handled on - * conversion to/from disk. - */ - uint32_t firstused; - __u8 holes; - struct { - uint16_t base; - uint16_t size; - } freemap[XFS_ATTR_LEAF_MAPSIZE]; -}; - -/* * Special value to represent fs block size in the leaf header firstused field. * Only used when block size overflows the 2-bytes available on disk. */ @@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. 
*/ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ @@ -751,19 +692,7 @@ struct xfs_attr3_icleaf_hdr { #define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) /* * Alignment for namelist and valuelist entries (since they are mixed diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 867c5dee0751..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -52,7 +52,7 @@ xfs_mode_to_ftype( * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. 
*/ -STATIC xfs_dahash_t +xfs_dahash_t xfs_ascii_ci_hashname( struct xfs_name *name) { @@ -65,14 +65,14 @@ xfs_ascii_ci_hashname( return hash; } -STATIC enum xfs_dacmp +enum xfs_dacmp xfs_ascii_ci_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) + struct xfs_da_args *args, + const unsigned char *name, + int len) { - enum xfs_dacmp result; - int i; + enum xfs_dacmp result; + int i; if (args->namelen != len) return XFS_CMP_DIFFERENT; @@ -89,26 +89,16 @@ xfs_ascii_ci_compname( return result; } -static const struct xfs_nameops xfs_ascii_ci_nameops = { - .hashname = xfs_ascii_ci_hashname, - .compname = xfs_ascii_ci_compname, -}; - int xfs_da_mount( struct xfs_mount *mp) { struct xfs_da_geometry *dageo; - int nodehdr_size; ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); - mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); - mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); - - nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), KM_MAYFAIL); mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), @@ -125,6 +115,27 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); + dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); + dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); + dageo->data_entry_offset = + sizeof(struct xfs_dir3_data_hdr); + } else { + dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr); + dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr); + dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr); + dageo->data_entry_offset = + sizeof(struct xfs_dir2_data_hdr); + } + dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) / + sizeof(struct xfs_dir2_leaf_entry); + 
dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) / + sizeof(xfs_dir2_data_off_t); + + dageo->data_first_offset = dageo->data_entry_offset + + xfs_dir2_data_entsize(mp, 1) + + xfs_dir2_data_entsize(mp, 2); /* * Now we've set up the block conversion variables, we can calculate the @@ -133,7 +144,7 @@ xfs_da_mount( dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); - dageo->node_ents = (dageo->blksize - nodehdr_size) / + dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->magicpct = (dageo->blksize * 37) / 100; @@ -143,15 +154,10 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = 1 << dageo->blklog; dageo->fsbcount = 1; - dageo->node_ents = (dageo->blksize - nodehdr_size) / + dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; + dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->magicpct = (dageo->blksize * 37) / 100; - - if (xfs_sb_version_hasasciici(&mp->m_sb)) - mp->m_dirnameops = &xfs_ascii_ci_nameops; - else - mp->m_dirnameops = &xfs_default_nameops; - return 0; } @@ -191,10 +197,10 @@ xfs_dir_ino_validate( { bool ino_ok = xfs_verify_dir_ino(mp, ino); - if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { + if (XFS_IS_CORRUPT(mp, !ino_ok) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); - XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } return 0; @@ -262,7 +268,7 @@ xfs_dir_createname( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; 
@@ -358,7 +364,7 @@ xfs_dir_lookup( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->dp = dp; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -430,7 +436,7 @@ xfs_dir_removename( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = ino; args->dp = dp; args->total = total; @@ -491,7 +497,7 @@ xfs_dir_replace( args->name = name->name; args->namelen = name->len; args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; @@ -600,7 +606,9 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + if (XFS_IS_CORRUPT(args->dp->i_mount, + rval != 0 && + args->dp->i_d.di_size != args->geo->blksize)) return -EFSCORRUPTED; *vp = rval; return 0; @@ -716,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h 
b/fs/xfs/libxfs/xfs_dir2.h index f54244779492..033777e282f2 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; struct xfs_dir2_data_hdr; struct xfs_dir2_data_entry; struct xfs_dir2_data_unused; +struct xfs_dir3_icfree_hdr; +struct xfs_dir3_icleaf_hdr; extern struct xfs_name xfs_name_dotdot; @@ -27,85 +29,6 @@ extern struct xfs_name xfs_name_dotdot; extern unsigned char xfs_mode_to_ftype(int mode); /* - * directory operations vector for encode/decode routines - */ -struct xfs_dir_ops { - int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); - struct xfs_dir2_sf_entry * - (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep); - uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); - void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, - uint8_t ftype); - xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep); - void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino); - xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); - void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino); - - int (*data_entsize)(int len); - uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); - void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, - uint8_t ftype); - __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); - struct xfs_dir2_data_free * - (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); - - xfs_dir2_data_aoff_t data_dot_offset; - xfs_dir2_data_aoff_t data_dotdot_offset; - xfs_dir2_data_aoff_t data_first_offset; - size_t data_entry_offset; - - struct xfs_dir2_data_entry * - (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); - struct xfs_dir2_data_entry * - (*data_entry_p)(struct xfs_dir2_data_hdr 
*hdr); - struct xfs_dir2_data_unused * - (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); - - int leaf_hdr_size; - void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, - struct xfs_dir3_icleaf_hdr *from); - void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, - struct xfs_dir2_leaf *from); - int (*leaf_max_ents)(struct xfs_da_geometry *geo); - struct xfs_dir2_leaf_entry * - (*leaf_ents_p)(struct xfs_dir2_leaf *lp); - - int node_hdr_size; - void (*node_hdr_to_disk)(struct xfs_da_intnode *to, - struct xfs_da3_icnode_hdr *from); - void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, - struct xfs_da_intnode *from); - struct xfs_da_node_entry * - (*node_tree_p)(struct xfs_da_intnode *dap); - - int free_hdr_size; - void (*free_hdr_to_disk)(struct xfs_dir2_free *to, - struct xfs_dir3_icfree_hdr *from); - void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, - struct xfs_dir2_free *from); - int (*free_max_bests)(struct xfs_da_geometry *geo); - __be16 * (*free_bests_p)(struct xfs_dir2_free *free); - xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, - xfs_dir2_db_t db); - int (*db_to_fdindex)(struct xfs_da_geometry *geo, - xfs_dir2_db_t db); -}; - -extern const struct xfs_dir_ops * - xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); -extern const struct xfs_dir_ops * - xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); - -/* * Generic directory interface routines */ extern void xfs_dir_startup(void); @@ -124,6 +47,8 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); +extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp, + xfs_ino_t inum); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); @@ -143,10 +68,7 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int 
xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); -extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, - const struct xfs_dir_ops *ops, - struct xfs_dir2_data_hdr *hdr, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_inode *dp, +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); @@ -324,7 +246,7 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) #define XFS_READDIR_BUFSIZE (32768) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); -void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, +unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); bool xfs_dir2_namecheck(const void *name, size_t length); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 49e4bc39e7bb..1dbf2f980a26 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; - err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, 
XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } @@ -172,7 +201,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * If there are stale entries we'll use one for the leaf. @@ -311,7 +340,7 @@ xfs_dir2_block_compact( * This needs to happen before the next call to use_free. */ if (needscan) - xfs_dir2_data_freescan(args->dp, hdr, needlog); + xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog); } /* @@ -355,7 +384,7 @@ xfs_dir2_block_addname( if (error) return error; - len = dp->d_ops->data_entsize(args->namelen); + len = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * Set up pointers to parts of the block. @@ -458,7 +487,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); needscan = 0; } /* @@ -541,14 +570,14 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. 
*/ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); @@ -633,7 +662,7 @@ xfs_dir2_block_lookup( * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return error; @@ -660,13 +689,11 @@ xfs_dir2_block_lookup_int( int high; /* binary search high index */ int low; /* binary search low index */ int mid; /* binary search current idx */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; - mp = dp->i_mount; error = xfs_dir3_block_read(tp, dp, &bp); if (error) @@ -718,7 +745,7 @@ xfs_dir2_block_lookup_int( * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. */ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *bpp = bp; @@ -791,7 +818,8 @@ xfs_dir2_block_removename( needlog = needscan = 0; xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * Fix up the block tail. */ @@ -806,7 +834,7 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. 
*/ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir3_data_check(dp, bp); @@ -864,7 +892,7 @@ xfs_dir2_block_replace( * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); - dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; @@ -914,7 +942,6 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -923,8 +950,7 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || @@ -938,7 +964,7 @@ xfs_dir2_leaf_to_block( while (dp->i_d.di_size > args->geo->blksize) { int hdrsz; - hdrsz = dp->d_ops->data_entry_offset; + hdrsz = args->geo->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == args->geo->blksize - hdrsz) { @@ -953,7 +979,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. 
*/ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1004,9 +1030,10 @@ xfs_dir2_leaf_to_block( */ lep = xfs_dir2_block_leaf_p(btp); for (from = to = 0; from < leafhdr.count; from++) { - if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr.ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = ents[from]; + lep[to++] = leafhdr.ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1014,7 +1041,7 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* @@ -1039,47 +1066,38 @@ xfs_dir2_leaf_to_block( */ int /* error */ xfs_dir2_sf_to_block( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) { + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ int dummy; /* trash */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ int endoffset; /* end of data objects */ int error; /* error return value */ int i; /* index */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to scan block freespc */ int newoffset; /* offset from current entry */ - int 
offset; /* target block offset */ + unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_name name; - struct xfs_ifork *ifp; trace_xfs_dir2_sf_to_block(args); - dp = args->dp; - tp = args->trans; - mp = dp->i_mount; - ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); ASSERT(ifp->if_flags & XFS_IFINLINE); - /* - * Bomb out if the shortform directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; @@ -1123,7 +1141,7 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = dp->d_ops->data_unused_p(hdr); + dup = bp->b_addr + offset; needlog = needscan = 0; error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, i, &needlog, &needscan); @@ -1146,35 +1164,37 @@ xfs_dir2_sf_to_block( be16_to_cpu(dup->length), &needlog, &needscan); if (error) goto out_free; + /* * Create entry for . 
*/ - dep = dp->d_ops->data_dot_entry_p(hdr); + dep = bp->b_addr + offset; dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); + offset += xfs_dir2_data_entsize(mp, dep->namelen); + /* * Create entry for .. */ - dep = dp->d_ops->data_dotdot_entry_p(hdr); - dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); + dep = bp->b_addr + offset; + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(offset); xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); - offset = dp->d_ops->data_first_offset; + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset)); + offset += xfs_dir2_data_entsize(mp, dep->namelen); + /* * Loop over existing entries, stuff them in. */ @@ -1183,6 +1203,7 @@ xfs_dir2_sf_to_block( sfep = NULL; else sfep = xfs_dir2_sf_firstentry(sfp); + /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. @@ -1199,40 +1220,39 @@ xfs_dir2_sf_to_block( * There should be a hole here, make one. 
*/ if (offset < newoffset) { - dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); + dup = bp->b_addr + offset; dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); - *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( - ((char *)dup - (char *)hdr)); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset); xfs_dir2_data_log_unused(args, bp, dup); xfs_dir2_data_freeinsert(hdr, - dp->d_ops->data_bestfree_p(hdr), - dup, &dummy); + xfs_dir2_data_bestfree_p(mp, hdr), + dup, &dummy); offset += be16_to_cpu(dup->length); continue; } /* * Copy a real entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); - dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); + dep = bp->b_addr + newoffset; + dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep)); dep->namelen = sfep->namelen; - dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); + xfs_dir2_data_put_ftype(mp, dep, + xfs_dir2_sf_get_ftype(mp, sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = dp->d_ops->data_entry_tag_p(dep); - *tagp = cpu_to_be16((char *)dep - (char *)hdr); + tagp = xfs_dir2_data_entry_tag_p(mp, dep); + *tagp = cpu_to_be16(newoffset); xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; - blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> - hashname(&name)); - blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( - (char *)dep - (char *)hdr)); + blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name)); + blp[2 + i].address = + cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) sfep = NULL; else - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ kmem_free(sfp); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 2c79be4c3153..375b3edb2ad2 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ 
b/fs/xfs/libxfs/xfs_dir2_data.c @@ -13,6 +13,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -23,6 +24,71 @@ static xfs_failaddr_t xfs_dir2_data_freefind_verify( struct xfs_dir2_data_unused *dup, struct xfs_dir2_data_free **bf_ent); +struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; + return hdr->bestfree; +} + +/* + * Pointer to an entry's tag word. + */ +__be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16)); +} + +uint8_t +xfs_dir2_data_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) +{ + if (xfs_sb_version_hasftype(&mp->m_sb)) { + uint8_t ftype = dep->name[dep->namelen]; + + if (likely(ftype < XFS_DIR3_FT_MAX)) + return ftype; + } + + return XFS_DIR3_FT_UNKNOWN; +} + +void +xfs_dir2_data_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep, + uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + dep->name[dep->namelen] = ftype; +} + +/* + * The number of leaf entries is limited by the size of the block and the amount + * of space used by the data entries. We don't know how much space is used by + * the data entries yet, so just ensure that the count falls somewhere inside + * the block right now. + */ +static inline unsigned int +xfs_dir2_data_max_leaf_entries( + struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_block_tail) - + geo->data_entry_offset) / + sizeof(struct xfs_dir2_leaf_entry); +} + /* * Check the consistency of the data block. * The input can also be a block-format directory. 
@@ -38,40 +104,27 @@ __xfs_dir3_data_check( xfs_dir2_block_tail_t *btp=NULL; /* block tail */ int count; /* count of entries found */ xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_free_t *dfp; /* bestfree entry */ - xfs_dir2_data_unused_t *dup; /* unused entry */ - char *endp; /* end of useful data */ int freeseen; /* mask of bestfrees seen */ xfs_dahash_t hash; /* hash of current name */ int i; /* leaf index */ int lastfree; /* last entry was unused */ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ struct xfs_mount *mp = bp->b_mount; - char *p; /* current data position */ int stale; /* count of stale leaves */ struct xfs_name name; - const struct xfs_dir_ops *ops; - struct xfs_da_geometry *geo; - - geo = mp->m_dir_geo; - - /* - * We can be passed a null dp here from a verifier, so we need to go the - * hard way to get them. - */ - ops = xfs_dir_get_ops(mp, dp); + unsigned int offset; + unsigned int end; + struct xfs_da_geometry *geo = mp->m_dir_geo; /* - * If this isn't a directory, or we don't get handed the dir ops, - * something is seriously wrong. Bail out. + * If this isn't a directory, something is seriously wrong. Bail out. */ - if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) || - ops != xfs_dir_get_ops(mp, NULL)) + if (dp && !S_ISDIR(VFS_I(dp)->i_mode)) return __this_address; hdr = bp->b_addr; - p = (char *)ops->data_entry_p(hdr); + offset = geo->data_entry_offset; switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): @@ -79,15 +132,8 @@ __xfs_dir3_data_check( btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); - /* - * The number of leaf entries is limited by the size of the - * block and the amount of space used by the data entries. - * We don't know how much space is used by the data entries yet, - * so just ensure that the count falls somewhere inside the - * block right now. 
- */ if (be32_to_cpu(btp->count) >= - ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)) + xfs_dir2_data_max_leaf_entries(geo)) return __this_address; break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -96,14 +142,14 @@ __xfs_dir3_data_check( default: return __this_address; } - endp = xfs_dir3_data_endp(geo, hdr); - if (!endp) + end = xfs_dir3_data_end_offset(geo, hdr); + if (!end) return __this_address; /* * Account for zero bestfree entries. */ - bf = ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(mp, hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { if (bf[0].offset) @@ -128,8 +174,10 @@ __xfs_dir3_data_check( /* * Loop over the data/unused entries. */ - while (p < endp) { - dup = (xfs_dir2_data_unused_t *)p; + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + /* * If it's unused, look for the space in the bestfree table. * If we find it, account for that, else make sure it @@ -140,10 +188,10 @@ __xfs_dir3_data_check( if (lastfree != 0) return __this_address; - if (endp < p + be16_to_cpu(dup->length)) + if (offset + be16_to_cpu(dup->length) > end) return __this_address; if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != - (char *)dup - (char *)hdr) + offset) return __this_address; fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp); if (fa) @@ -158,7 +206,7 @@ __xfs_dir3_data_check( be16_to_cpu(bf[2].length)) return __this_address; } - p += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); lastfree = 1; continue; } @@ -168,17 +216,15 @@ __xfs_dir3_data_check( * in the leaf section of the block. * The linear search is crude but this is DEBUG code. 
*/ - dep = (xfs_dir2_data_entry_t *)p; if (dep->namelen == 0) return __this_address; if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))) return __this_address; - if (endp < p + ops->data_entsize(dep->namelen)) + if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end) return __this_address; - if (be16_to_cpu(*ops->data_entry_tag_p(dep)) != - (char *)dep - (char *)hdr) + if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset) return __this_address; - if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX) + if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX) return __this_address; count++; lastfree = 0; @@ -189,7 +235,7 @@ __xfs_dir3_data_check( ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; - hash = mp->m_dirnameops->hashname(&name); + hash = xfs_dir2_hashname(mp, &name); for (i = 0; i < be32_to_cpu(btp->count); i++) { if (be32_to_cpu(lep[i].address) == addr && be32_to_cpu(lep[i].hashval) == hash) @@ -198,7 +244,7 @@ __xfs_dir3_data_check( if (i >= be32_to_cpu(btp->count)) return __this_address; } - p += ops->data_entsize(dep->namelen); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. 
@@ -348,21 +394,49 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno, + unsigned int flags, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; - err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, + &xfs_dir3_data_buf_ops); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -370,10 +444,10 @@ int xfs_dir3_data_readahead( struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno) + unsigned int flags) { - return xfs_da_reada_buf(dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); + return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK, + &xfs_dir3_data_reada_buf_ops); } /* @@ -561,17 +635,16 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. 
*/ void -xfs_dir2_data_freescan_int( - struct xfs_da_geometry *geo, - const struct xfs_dir_ops *ops, - struct xfs_dir2_data_hdr *hdr, - int *loghead) +xfs_dir2_data_freescan( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) { - xfs_dir2_data_entry_t *dep; /* active data entry */ - xfs_dir2_data_unused_t *dup; /* unused data entry */ - struct xfs_dir2_data_free *bf; - char *endp; /* end of block's data */ - char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr); + void *addr = hdr; + unsigned int offset = geo->data_entry_offset; + unsigned int end; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -581,79 +654,60 @@ xfs_dir2_data_freescan_int( /* * Start by clearing the table. */ - bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; - /* - * Set up pointers. - */ - p = (char *)ops->data_entry_p(hdr); - endp = xfs_dir3_data_endp(geo, hdr); - /* - * Loop over the block's entries. - */ - while (p < endp) { - dup = (xfs_dir2_data_unused_t *)p; + + end = xfs_dir3_data_end_offset(geo, addr); + while (offset < end) { + struct xfs_dir2_data_unused *dup = addr + offset; + struct xfs_dir2_data_entry *dep = addr + offset; + /* * If it's a free entry, insert it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT((char *)dup - (char *)hdr == + ASSERT(offset == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); - p += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); + continue; } + /* * For active entries, check their tags and skip them. 
*/ - else { - dep = (xfs_dir2_data_entry_t *)p; - ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*ops->data_entry_tag_p(dep))); - p += ops->data_entsize(dep->namelen); - } + ASSERT(offset == + be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep))); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } } -void -xfs_dir2_data_freescan( - struct xfs_inode *dp, - struct xfs_dir2_data_hdr *hdr, - int *loghead) -{ - return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, - hdr, loghead); -} - /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. */ int /* error */ xfs_dir3_data_init( - xfs_da_args_t *args, /* directory operation args */ - xfs_dir2_db_t blkno, /* logical dir block number */ - struct xfs_buf **bpp) /* output block buffer */ + struct xfs_da_args *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + struct xfs_buf **bpp) /* output block buffer */ { - struct xfs_buf *bp; /* block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* unused entry pointer */ - struct xfs_dir2_data_free *bf; - int error; /* error return value */ - int i; /* bestfree index */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - int t; /* temp */ - - dp = args->dp; - mp = dp->i_mount; - tp = args->trans; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = args->geo; + struct xfs_buf *bp; + struct xfs_dir2_data_hdr *hdr; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_free *bf; + int error; + int i; + /* * Get the buffer set up for the block. 
*/ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; bp->b_ops = &xfs_dir3_data_buf_ops; @@ -675,8 +729,9 @@ xfs_dir3_data_init( } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - bf = dp->d_ops->data_bestfree_p(hdr); - bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); + bf = xfs_dir2_data_bestfree_p(mp, hdr); + bf[0].offset = cpu_to_be16(geo->data_entry_offset); + bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { bf[i].length = 0; bf[i].offset = 0; @@ -685,13 +740,11 @@ xfs_dir3_data_init( /* * Set up an unused entry for the block's body. */ - dup = dp->d_ops->data_unused_p(hdr); + dup = bp->b_addr + geo->data_entry_offset; dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - - t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; - bf[0].length = cpu_to_be16(t); - dup->length = cpu_to_be16(t); + dup->length = bf[0].length; *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); + /* * Log it and return it. 
*/ @@ -710,6 +763,7 @@ xfs_dir2_data_log_entry( struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { + struct xfs_mount *mp = bp->b_mount; struct xfs_dir2_data_hdr *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || @@ -718,7 +772,7 @@ xfs_dir2_data_log_entry( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - + (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) - (char *)hdr - 1)); } @@ -739,8 +793,7 @@ xfs_dir2_data_log_header( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); #endif - xfs_trans_log_buf(args->trans, bp, 0, - args->dp->d_ops->data_entry_offset - 1); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1); } /* @@ -789,11 +842,11 @@ xfs_dir2_data_make_free( { xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ - char *endptr; /* end of data area */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + unsigned int end; struct xfs_dir2_data_free *bf; hdr = bp->b_addr; @@ -801,14 +854,14 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - endptr = xfs_dir3_data_endp(args->geo, hdr); - ASSERT(endptr != NULL); + end = xfs_dir3_data_end_offset(args->geo, hdr); + ASSERT(end != 0); /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > args->dp->d_ops->data_entry_offset) { + if (offset > args->geo->data_entry_offset) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -821,7 +874,7 @@ xfs_dir2_data_make_free( * If this isn't the end of the block, see if the entry after * us is free. 
*/ - if ((char *)hdr + offset + len < endptr) { + if (offset + len < end) { postdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) @@ -834,7 +887,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ - bf = args->dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -1025,7 +1078,7 @@ xfs_dir2_data_use_free( * Look up the entry in the bestfree table. */ oldlen = be16_to_cpu(dup->length); - bf = args->dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* @@ -1149,19 +1202,22 @@ corrupt: } /* Find the end of the entry data in a data/block format dir block. */ -void * -xfs_dir3_data_endp( +unsigned int +xfs_dir3_data_end_offset( struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) { + void *p; + switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); + return p - (void *)hdr; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - return (char *)hdr + geo->blksize; + return geo->blksize; default: - return NULL; + return 0; } } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a53e4585a2f3..95d2a3f92d75 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -24,12 +24,73 @@ * Local function declarations. 
*/ static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, - int *indexp, struct xfs_buf **dbpp); + int *indexp, struct xfs_buf **dbpp, + struct xfs_dir3_icleaf_hdr *leafhdr); static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, struct xfs_buf *bp, int first, int last); static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, struct xfs_buf *bp); +void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from; + + to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); + to->back = be32_to_cpu(from3->hdr.info.hdr.back); + to->magic = be16_to_cpu(from3->hdr.info.hdr.magic); + to->count = be16_to_cpu(from3->hdr.count); + to->stale = be16_to_cpu(from3->hdr.stale); + to->ents = from3->__ents; + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); + } else { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + to->ents = from->__ents; + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); + } +} + +void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + to3->hdr.info.hdr.forw = cpu_to_be32(from->forw); + to3->hdr.info.hdr.back = cpu_to_be32(from->back); + to3->hdr.info.hdr.magic = cpu_to_be16(from->magic); + to3->hdr.count = cpu_to_be16(from->count); + to3->hdr.stale = cpu_to_be16(from->stale); + } else { + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == 
XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); + } +} + /* * Check the internal consistency of a leaf1 block. * Pop an assert if something is wrong. @@ -43,7 +104,7 @@ xfs_dir3_leaf1_check( struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -52,7 +113,7 @@ xfs_dir3_leaf1_check( } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); } static inline void @@ -76,31 +137,15 @@ xfs_dir3_leaf_check( xfs_failaddr_t xfs_dir3_leaf_check_int( - struct xfs_mount *mp, - struct xfs_inode *dp, - struct xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf) + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) { - struct xfs_dir2_leaf_entry *ents; - xfs_dir2_leaf_tail_t *ltp; - int stale; - int i; - const struct xfs_dir_ops *ops; - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_da_geometry *geo = mp->m_dir_geo; - - /* - * we can be passed a null dp here from a verifier, so we need to go the - * hard way to get them. - */ - ops = xfs_dir_get_ops(mp, dp); + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; - if (!hdr) { - ops->leaf_hdr_from_disk(&leafhdr, leaf); - hdr = &leafhdr; - } - - ents = ops->leaf_ents_p(leaf); ltp = xfs_dir2_leaf_tail_p(geo, leaf); /* @@ -108,23 +153,23 @@ xfs_dir3_leaf_check_int( * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. 
*/ - if (hdr->count > ops->leaf_max_ents(geo)) + if (hdr->count > geo->leaf_max_ents) return __this_address; /* Leaves and bests don't overlap in leaf format. */ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || hdr->magic == XFS_DIR3_LEAF1_MAGIC) && - (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; /* Check hash value order, count stale entries. */ for (i = stale = 0; i < hdr->count; i++) { if (i + 1 < hdr->count) { - if (be32_to_cpu(ents[i].hashval) > - be32_to_cpu(ents[i + 1].hashval)) + if (be32_to_cpu(hdr->ents[i].hashval) > + be32_to_cpu(hdr->ents[i + 1].hashval)) return __this_address; } - if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (hdr->stale != stale) @@ -139,17 +184,18 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_dir2_leaf *leaf = bp->b_addr; - xfs_failaddr_t fa; + struct xfs_mount *mp = bp->b_mount; + struct xfs_dir3_icleaf_hdr leafhdr; + xfs_failaddr_t fa; fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); if (fa) return fa; - return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr); + return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr); } static void @@ -216,13 +262,12 @@ xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, + &xfs_dir3_leaf1_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); return err; @@ -233,13 +278,12 @@ xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t 
fbno, - xfs_daddr_t mappedbno, struct xfs_buf **bpp) { int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, + &xfs_dir3_leafn_buf_ops); if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); return err; @@ -311,7 +355,7 @@ xfs_dir3_leaf_get_buf( bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; @@ -346,7 +390,6 @@ xfs_dir2_block_to_leaf( int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -375,24 +418,24 @@ xfs_dir2_block_to_leaf( xfs_dir3_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); - bf = dp->d_ops->data_bestfree_p(hdr); - ents = dp->d_ops->leaf_ents_p(leaf); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * Set the counts in the leaf header. */ - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); leafhdr.count = be32_to_cpu(btp->count); leafhdr.stale = be32_to_cpu(btp->stale); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. 
*/ - memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); + memcpy(leafhdr.ents, blp, + be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry)); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* @@ -415,7 +458,7 @@ xfs_dir2_block_to_leaf( hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); /* * Set up leaf tail and bests table. */ @@ -594,7 +637,7 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); if (error) return error; @@ -607,10 +650,10 @@ xfs_dir2_leaf_addname( index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; bestsp = xfs_dir2_leaf_bests_p(ltp); - length = dp->d_ops->data_entsize(args->namelen); + length = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * See if there are any entries with the same hash value @@ -773,7 +816,7 @@ xfs_dir2_leaf_addname( else xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); bestsp[use_block] = bf[0].length; grown = 1; } else { @@ -783,13 +826,13 @@ xfs_dir2_leaf_addname( */ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, use_block), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); grown = 0; } /* @@ -815,14 +858,14 @@ xfs_dir2_leaf_addname( 
dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); /* * Need to log the data block's header. */ @@ -852,9 +895,9 @@ xfs_dir2_leaf_addname( /* * Log the leaf fields and give up the buffers. */ - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); - xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); return 0; @@ -874,7 +917,6 @@ xfs_dir3_leaf_compact( xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ - struct xfs_dir2_leaf_entry *ents; struct xfs_inode *dp = args->dp; leaf = bp->b_addr; @@ -884,9 +926,9 @@ xfs_dir3_leaf_compact( /* * Compress out the stale entries in place. */ - ents = dp->d_ops->leaf_ents_p(leaf); for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { - if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr->ents[from].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. 
@@ -894,7 +936,7 @@ xfs_dir3_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - ents[to] = ents[from]; + leafhdr->ents[to] = leafhdr->ents[from]; } to++; } @@ -905,10 +947,10 @@ xfs_dir3_leaf_compact( leafhdr->count -= leafhdr->stale; leafhdr->stale = 0; - dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr); xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1); } /* @@ -1037,6 +1079,7 @@ xfs_dir3_leaf_log_bests( void xfs_dir3_leaf_log_ents( struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first, int last) @@ -1044,16 +1087,14 @@ xfs_dir3_leaf_log_ents( xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ struct xfs_dir2_leaf *leaf = bp->b_addr; - struct xfs_dir2_leaf_entry *ents; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ents = args->dp->d_ops->leaf_ents_p(leaf); - firstlep = &ents[first]; - lastlep = &ents[last]; + firstlep = &hdr->ents[first]; + lastlep = &hdr->ents[last]; xfs_trans_log_buf(args->trans, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); @@ -1076,7 +1117,7 @@ xfs_dir3_leaf_log_header( xfs_trans_log_buf(args->trans, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - args->dp->d_ops->leaf_hdr_size - 1); + args->geo->leaf_hdr_size - 1); } /* @@ -1115,28 +1156,27 @@ xfs_dir2_leaf_lookup( int error; /* error return code */ int index; /* found entry index */ struct xfs_buf *lbp; /* leaf buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction 
pointer */ - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_lookup(args); /* * Look up name in the leaf block, returning both buffers and index. */ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + tp = args->trans; dp = args->dp; xfs_dir3_leaf_check(dp, lbp); - leaf = lbp->b_addr; - ents = dp->d_ops->leaf_ents_p(leaf); + /* * Get to the leaf entry and contained data entry address. */ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Point to the data entry. @@ -1148,7 +1188,7 @@ xfs_dir2_leaf_lookup( * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); @@ -1166,7 +1206,8 @@ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - struct xfs_buf **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp, /* out: data buffer */ + struct xfs_dir3_icleaf_hdr *leafhdr) { xfs_dir2_db_t curdb = -1; /* current data block number */ struct xfs_buf *dbp = NULL; /* data buffer */ @@ -1182,22 +1223,19 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. 
*/ enum xfs_dacmp cmp; /* name compare result */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; xfs_dir3_leaf_check(dp, lbp); - ents = dp->d_ops->leaf_ents_p(leaf); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf); /* * Look for the first leaf entry with our hash value. @@ -1207,8 +1245,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &ents[index]; - index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + for (lep = &leafhdr->ents[index]; + index < leafhdr->count && + be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* * Skip over stale leaf entries. @@ -1229,7 +1268,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, newdb), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1247,7 +1286,7 @@ xfs_dir2_leaf_lookup_int( * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. 
*/ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *indexp = index; @@ -1271,7 +1310,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, cidb), - -1, &dbp); + 0, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1297,6 +1336,7 @@ int /* error */ xfs_dir2_leaf_removename( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_da_geometry *geo = args->geo; __be16 *bestsp; /* leaf block best freespace */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ @@ -1314,7 +1354,6 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1322,51 +1361,54 @@ xfs_dir2_leaf_removename( /* * Lookup the leaf entry, get the leaf and data blocks read in. */ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + dp = args->dp; leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); - bf = dp->d_ops->data_bestfree_p(hdr); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); + /* * Point to the leaf entry, use that to point to the data entry. 
*/ - lep = &ents[index]; - db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + lep = &leafhdr.ents[index]; + db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *)((char *)hdr + - xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(bf[0].length); - ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - if (be16_to_cpu(bestsp[db]) != oldbest) + if (be16_to_cpu(bestsp[db]) != oldbest) { + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; + } /* * Mark the former data entry unused. */ xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ leafhdr.stale++; - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(args, lbp, index, index); + xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index); /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* @@ -1382,8 +1424,8 @@ xfs_dir2_leaf_removename( * If the data block is now empty then get rid of the data block. 
*/ if (be16_to_cpu(bf[0].length) == - args->geo->blksize - dp->d_ops->data_entry_offset) { - ASSERT(db != args->geo->datablk); + geo->blksize - geo->data_entry_offset) { + ASSERT(db != geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1425,7 +1467,7 @@ xfs_dir2_leaf_removename( /* * If the data block was not the first one, drop it. */ - else if (db != args->geo->datablk) + else if (db != geo->datablk) dbp = NULL; xfs_dir3_leaf_check(dp, lbp); @@ -1448,26 +1490,24 @@ xfs_dir2_leaf_replace( int error; /* error return code */ int index; /* index of leaf entry */ struct xfs_buf *lbp; /* leaf buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_replace(args); /* * Look up the entry. */ - if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr); + if (error) return error; - } + dp = args->dp; - leaf = lbp->b_addr; - ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Point to the data entry. */ @@ -1479,7 +1519,7 @@ xfs_dir2_leaf_replace( * Put the new inode number in, log it. 
*/ dep->inumber = cpu_to_be64(args->inumber); - dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); tp = args->trans; xfs_dir2_data_log_entry(args, dbp, dep); xfs_dir3_leaf_check(dp, lbp); @@ -1501,21 +1541,17 @@ xfs_dir2_leaf_search_hash( xfs_dahash_t hashwant; /* hash value looking for */ int high; /* high leaf index */ int low; /* low leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; - leaf = lbp->b_addr; - ents = args->dp->d_ops->leaf_ents_p(leaf); - args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr); /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = ents, low = 0, high = leafhdr.count - 1, + for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1552,6 +1588,7 @@ xfs_dir2_leaf_trim_data( struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { + struct xfs_da_geometry *geo = args->geo; __be16 *bestsp; /* leaf bests table */ struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ @@ -1565,23 +1602,23 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. 
*/ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), - -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; - struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); + struct xfs_dir2_data_free *bf = + xfs_dir2_data_bestfree_p(dp->i_mount, hdr); ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); ASSERT(be16_to_cpu(bf[0].length) == - args->geo->blksize - dp->d_ops->data_entry_offset); + geo->blksize - geo->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1639,7 +1676,6 @@ xfs_dir2_node_to_leaf( int error; /* error return code */ struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ - xfs_dir2_free_t *free; /* freespace structure */ struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1697,7 +1733,7 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); @@ -1708,8 +1744,7 @@ xfs_dir2_node_to_leaf( error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); if (error) return error; - free = fbp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); ASSERT(!freehdr.firstdb); @@ -1743,10 +1778,10 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf bests table. 
*/ - memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests, freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, lbp); xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); xfs_dir3_leaf_log_tail(args, lbp); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 705c4f562758..6ac4aad98cd7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -34,6 +34,25 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int *rval); /* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / geo->free_max_bests); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % geo->free_max_bests; +} + +/* * Check internal consistency of a leafn block. 
*/ #ifdef DEBUG @@ -45,7 +64,7 @@ xfs_dir3_leafn_check( struct xfs_dir2_leaf *leaf = bp->b_addr; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; @@ -54,7 +73,7 @@ xfs_dir3_leafn_check( } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; - return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); + return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf); } static inline void @@ -160,10 +179,9 @@ xfs_dir3_free_header_check( struct xfs_buf *bp) { struct xfs_mount *mp = dp->i_mount; + int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; - int maxbests; - maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * maxbests; @@ -176,6 +194,8 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -194,22 +214,23 @@ __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, - xfs_daddr_t mappedbno, + unsigned int flags, struct xfs_buf **bpp) { xfs_failaddr_t fa; int err; - err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK, + &xfs_dir3_free_buf_ops); if (err || !*bpp) return err; /* Check things that we can't do in the verifier. 
*/ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; } @@ -220,6 +241,58 @@ __xfs_dir3_free_read( return 0; } +void +xfs_dir2_free_hdr_from_disk( + struct xfs_mount *mp, + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from; + + to->magic = be32_to_cpu(from3->hdr.hdr.magic); + to->firstdb = be32_to_cpu(from3->hdr.firstdb); + to->nvalid = be32_to_cpu(from3->hdr.nvalid); + to->nused = be32_to_cpu(from3->hdr.nused); + to->bests = from3->bests; + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); + } else { + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + to->bests = from->bests; + + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); + } +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_mount *mp, + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + to3->hdr.hdr.magic = cpu_to_be32(from->magic); + to3->hdr.firstdb = cpu_to_be32(from->firstdb); + to3->hdr.nvalid = cpu_to_be32(from->nvalid); + to3->hdr.nused = cpu_to_be32(from->nused); + } else { + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); + } +} + int xfs_dir2_free_read( struct xfs_trans *tp, @@ -227,7 +300,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); } 
static int @@ -237,7 +310,7 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); } static int @@ -254,7 +327,7 @@ xfs_dir3_free_get_buf( struct xfs_dir3_icfree_hdr hdr; error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), - -1, &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK); if (error) return error; @@ -278,7 +351,7 @@ xfs_dir3_free_get_buf( uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; - dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr); *bpp = bp; return 0; } @@ -289,21 +362,19 @@ xfs_dir3_free_get_buf( STATIC void xfs_dir2_free_log_bests( struct xfs_da_args *args, + struct xfs_dir3_icfree_hdr *hdr, struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { - xfs_dir2_free_t *free; /* freespace structure */ - __be16 *bests; + struct xfs_dir2_free *free = bp->b_addr; - free = bp->b_addr; - bests = args->dp->d_ops->free_bests_p(free); ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); xfs_trans_log_buf(args->trans, bp, - (uint)((char *)&bests[first] - (char *)free), - (uint)((char *)&bests[last] - (char *)free + - sizeof(bests[0]) - 1)); + (char *)&hdr->bests[first] - (char *)free, + (char *)&hdr->bests[last] - (char *)free + + sizeof(hdr->bests[0]) - 1); } /* @@ -322,7 +393,7 @@ xfs_dir2_free_log_header( free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); #endif xfs_trans_log_buf(args->trans, bp, 0, - args->dp->d_ops->free_hdr_size - 1); + args->geo->free_hdr_size - 1); } /* @@ -339,14 +410,12 @@ xfs_dir2_leaf_to_node( int error; /* error return value */ struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ - xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to 
freespace entry */ int i; /* leaf freespace index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ int n; /* count of live freespc ents */ xfs_dir2_data_off_t off; /* freespace entry value */ - __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icfree_hdr freehdr; @@ -368,24 +437,25 @@ xfs_dir2_leaf_to_node( if (error) return error; - free = fbp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > - (uint)dp->i_d.di_size / args->geo->blksize) + (uint)dp->i_d.di_size / args->geo->blksize) { + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; + } /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ from = xfs_dir2_leaf_bests_p(ltp); - to = dp->d_ops->free_bests_p(free); - for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { - if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) { + off = be16_to_cpu(*from); + if (off != NULLDATAOFF) n++; - *to = cpu_to_be16(off); + freehdr.bests[i] = cpu_to_be16(off); } /* @@ -394,8 +464,8 @@ xfs_dir2_leaf_to_node( freehdr.nused = n; freehdr.nvalid = be32_to_cpu(ltp->bestcount); - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1); xfs_dir2_free_log_header(args, fbp); /* @@ -438,15 +508,17 @@ xfs_dir2_leafn_add( trace_xfs_dir2_leafn_add(args, index); - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; /* * Quick check just to make sure we are 
not going to index * into other peoples memory */ - if (index < 0) + if (index < 0) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * If there are already the maximum number of leaf entries in @@ -455,7 +527,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (leafhdr.count == args->geo->leaf_max_ents) { if (!leafhdr.stale) return -ENOSPC; compact = leafhdr.stale > 1; @@ -493,9 +565,9 @@ xfs_dir2_leafn_add( lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, bp); - xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, bp); return 0; } @@ -509,10 +581,9 @@ xfs_dir2_free_hdr_check( { struct xfs_dir3_icfree_hdr hdr; - dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr); - ASSERT((hdr.firstdb % - dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0); ASSERT(hdr.firstdb <= db); ASSERT(db < hdr.firstdb + hdr.nvalid); } @@ -530,11 +601,9 @@ xfs_dir2_leaf_lasthash( struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - struct xfs_dir2_leaf *leaf = bp->b_addr; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr); ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || leafhdr.magic == XFS_DIR3_LEAFN_MAGIC || @@ -545,9 +614,7 @@ xfs_dir2_leaf_lasthash( *count = leafhdr.count; if (!leafhdr.count) return 0; - - ents = dp->d_ops->leaf_ents_p(leaf); - return be32_to_cpu(ents[leafhdr.count - 1].hashval); + return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval); } 
/* @@ -576,15 +643,13 @@ xfs_dir2_leafn_lookup_for_addname( xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); xfs_dir3_leaf_check(dp, bp); ASSERT(leafhdr.count > 0); @@ -604,11 +669,11 @@ xfs_dir2_leafn_lookup_for_addname( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } - length = dp->d_ops->data_entsize(args->namelen); + length = xfs_dir2_data_entsize(mp, args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &ents[index]; + for (lep = &leafhdr.ents[index]; index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* @@ -630,14 +695,14 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { - __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; curdb = newdb; /* * Convert the data block to the free block * holding its freespace information. */ - newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); + newfdb = xfs_dir2_db_to_fdb(args->geo, newdb); /* * If it's not the one we have in hand, read it in. */ @@ -661,20 +726,20 @@ xfs_dir2_leafn_lookup_for_addname( /* * Get the index for our entry. */ - fi = dp->d_ops->db_to_fdindex(args->geo, curdb); + fi = xfs_dir2_db_to_fdindex(args->geo, curdb); /* * If it has room, return it. 
*/ - bests = dp->d_ops->free_bests_p(free); - if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { - XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", - XFS_ERRLEVEL_LOW, mp); + xfs_dir2_free_hdr_from_disk(mp, &freehdr, free); + if (XFS_IS_CORRUPT(mp, + freehdr.bests[fi] == + cpu_to_be16(NULLDATAOFF))) { if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); return -EFSCORRUPTED; } curfdb = newfdb; - if (be16_to_cpu(bests[fi]) >= length) + if (be16_to_cpu(freehdr.bests[fi]) >= length) goto out; } } @@ -728,19 +793,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ - struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf); xfs_dir3_leaf_check(dp, bp); - if (leafhdr.count <= 0) + if (leafhdr.count <= 0) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Look up the hash value in the leaf entries. @@ -756,7 +821,7 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &ents[index]; + for (lep = &leafhdr.ents[index]; index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* @@ -795,7 +860,7 @@ xfs_dir2_leafn_lookup_for_entry( error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, newdb), - -1, &curbp); + 0, &curbp); if (error) return error; } @@ -813,7 +878,7 @@ xfs_dir2_leafn_lookup_for_entry( * EEXIST immediately. If it's the first case-insensitive * match, store the block & inode number and continue looking. 
*/ - cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { /* If there is a CI match block, drop it */ if (args->cmpresult != XFS_CMP_DIFFERENT && @@ -821,7 +886,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); - args->filetype = dp->d_ops->data_get_ftype(dep); + args->filetype = xfs_dir2_data_get_ftype(mp, dep); *indexp = index; state->extravalid = 1; state->extrablk.bp = curbp; @@ -911,7 +976,7 @@ xfs_dir3_leafn_moveents( if (start_d < dhdr->count) { memmove(&dents[start_d + count], &dents[start_d], (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count, count + dhdr->count - 1); } /* @@ -933,7 +998,7 @@ xfs_dir3_leafn_moveents( */ memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1); /* * If there are source entries after the ones we copied, @@ -942,7 +1007,8 @@ xfs_dir3_leafn_moveents( if (start_s + count < shdr->count) { memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s, + start_s + count - 1); } /* @@ -971,10 +1037,10 @@ xfs_dir2_leafn_order( struct xfs_dir3_icleaf_hdr hdr1; struct xfs_dir3_icleaf_hdr hdr2; - dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = dp->d_ops->leaf_ents_p(leaf1); - ents2 = dp->d_ops->leaf_ents_p(leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2); + ents1 = 
hdr1.ents; + ents2 = hdr2.ents; if (hdr1.count > 0 && hdr2.count > 0 && (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || @@ -1024,10 +1090,10 @@ xfs_dir2_leafn_rebalance( leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); - ents1 = dp->d_ops->leaf_ents_p(leaf1); - ents2 = dp->d_ops->leaf_ents_p(leaf2); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2); + ents1 = hdr1.ents; + ents2 = hdr2.ents; oldsum = hdr1.count + hdr2.count; #if defined(DEBUG) || defined(XFS_WARN) @@ -1073,8 +1139,8 @@ xfs_dir2_leafn_rebalance( ASSERT(hdr1.stale + hdr2.stale == oldstale); /* log the changes made when moving the entries */ - dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); - dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2); xfs_dir3_leaf_log_header(args, blk1->bp); xfs_dir3_leaf_log_header(args, blk2->bp); @@ -1120,19 +1186,17 @@ xfs_dir3_data_block_free( int longest) { int logfree = 0; - __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; struct xfs_inode *dp = args->dp; - dp->d_ops->free_hdr_from_disk(&freehdr, free); - bests = dp->d_ops->free_bests_p(free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); if (hdr) { /* * Data block is not empty, just set the free entry to the new * value. 
*/ - bests[findex] = cpu_to_be16(longest); - xfs_dir2_free_log_bests(args, fbp, findex, findex); + freehdr.bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); return 0; } @@ -1148,18 +1212,18 @@ xfs_dir3_data_block_free( int i; /* free entry index */ for (i = findex - 1; i >= 0; i--) { - if (bests[i] != cpu_to_be16(NULLDATAOFF)) + if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF)) break; } freehdr.nvalid = i + 1; logfree = 0; } else { /* Not the last entry, just punch it out. */ - bests[findex] = cpu_to_be16(NULLDATAOFF); + freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF); logfree = 1; } - dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr); xfs_dir2_free_log_header(args, fbp); /* @@ -1184,7 +1248,7 @@ xfs_dir3_data_block_free( /* Log the free entry that changed, unless we got rid of it. */ if (logfree) - xfs_dir2_free_log_bests(args, fbp, findex, findex); + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); return 0; } @@ -1201,6 +1265,7 @@ xfs_dir2_leafn_remove( xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { + struct xfs_da_geometry *geo = args->geo; xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ struct xfs_buf *dbp; /* data block buffer */ @@ -1215,27 +1280,25 @@ xfs_dir2_leafn_remove( xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; /* bestfree table */ struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); dp = args->dp; tp = args->trans; leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); /* * Point to the entry we're removing. */ - lep = &ents[index]; + lep = &leafhdr.ents[index]; /* * Extract the data block and offset from the entry. 
*/ - db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* @@ -1243,11 +1306,11 @@ xfs_dir2_leafn_remove( * Log the leaf block changes. */ leafhdr.stale++; - dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr); xfs_dir3_leaf_log_header(args, bp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(args, bp, index, index); + xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index); /* * Make the data entry free. Keep track of the longest freespace @@ -1256,17 +1319,18 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(args, dbp, off, - dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog, + &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); xfs_dir3_data_check(dp, dbp); @@ -1285,9 +1349,8 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. 
*/ - fdb = dp->d_ops->db_to_fdb(args->geo, db); - error = xfs_dir2_free_read(tp, dp, - xfs_dir2_db_to_da(args->geo, fdb), + fdb = xfs_dir2_db_to_fdb(geo, db); + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; @@ -1295,23 +1358,22 @@ xfs_dir2_leafn_remove( #ifdef DEBUG { struct xfs_dir3_icfree_hdr freehdr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); - ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * - (fdb - xfs_dir2_byte_to_db(args->geo, - XFS_DIR2_FREE_OFFSET))); + + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); + ASSERT(freehdr.firstdb == geo->free_max_bests * + (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET))); } #endif /* * Calculate which entry we need to fix. */ - findex = dp->d_ops->db_to_fdindex(args->geo, db); + findex = xfs_dir2_db_to_fdindex(geo, db); longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == args->geo->blksize - - dp->d_ops->data_entry_offset) { + if (longest == geo->blksize - geo->data_entry_offset) { /* * Try to punch out the data block. */ @@ -1343,9 +1405,9 @@ xfs_dir2_leafn_remove( * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. 
*/ - *rval = (dp->d_ops->leaf_hdr_size + - (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < - args->geo->magicpct; + *rval = (geo->leaf_hdr_size + + (uint)sizeof(leafhdr.ents) * (leafhdr.count - leafhdr.stale)) < + geo->magicpct; return 0; } @@ -1444,12 +1506,12 @@ xfs_dir2_leafn_toosmall( */ blk = &state->path.blk[state->path.active - 1]; leaf = blk->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf); + ents = leafhdr.ents; xfs_dir3_leaf_check(dp, blk->bp); count = leafhdr.count - leafhdr.stale; - bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]); if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. @@ -1494,8 +1556,7 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_dir3_leafn_read(state->args->trans, dp, - blkno, -1, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); if (error) return error; @@ -1507,8 +1568,8 @@ xfs_dir2_leafn_toosmall( (state->args->geo->blksize >> 2); leaf = bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); - ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf); + ents = hdr2.ents; count += hdr2.count - hdr2.stale; bytes -= count * sizeof(ents[0]); @@ -1570,10 +1631,10 @@ xfs_dir2_leafn_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); - dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); - sents = dp->d_ops->leaf_ents_p(save_leaf); - dents = dp->d_ops->leaf_ents_p(drop_leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf); + xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf); + sents = savehdr.ents; + dents = drophdr.ents; /* * If there are any stale leaf entries, take this opportunity @@ -1599,8 +1660,8 @@ 
xfs_dir2_leafn_unbalance( save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); /* log the changes made when moving the entries */ - dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); - dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr); + xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr); xfs_dir3_leaf_log_header(args, save_blk->bp); xfs_dir3_leaf_log_header(args, drop_blk->bp); @@ -1619,19 +1680,16 @@ xfs_dir2_node_add_datablk( xfs_dir2_db_t *dbno, struct xfs_buf **dbpp, struct xfs_buf **fbpp, + struct xfs_dir3_icfree_hdr *hdr, int *findex) { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; struct xfs_mount *mp = dp->i_mount; - struct xfs_dir3_icfree_hdr freehdr; struct xfs_dir2_data_free *bf; - struct xfs_dir2_data_hdr *hdr; - struct xfs_dir2_free *free = NULL; xfs_dir2_db_t fbno; struct xfs_buf *fbp; struct xfs_buf *dbp; - __be16 *bests = NULL; int error; /* Not allowed to allocate, return failure. */ @@ -1650,7 +1708,7 @@ xfs_dir2_node_add_datablk( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = dp->d_ops->db_to_fdb(args->geo, *dbno); + fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); error = xfs_dir2_free_try_read(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) @@ -1665,11 +1723,13 @@ xfs_dir2_node_add_datablk( if (error) return error; - if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) { + if (XFS_IS_CORRUPT(mp, + xfs_dir2_db_to_fdb(args->geo, *dbno) != + fbno)) { xfs_alert(mp, "%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", __func__, (unsigned long long)dp->i_ino, - (long long)dp->d_ops->db_to_fdb(args->geo, *dbno), + (long long)xfs_dir2_db_to_fdb(args->geo, *dbno), (long long)*dbno, (long long)fbno); if (fblk) { xfs_alert(mp, @@ -1679,7 +1739,6 @@ xfs_dir2_node_add_datablk( } else { xfs_alert(mp, " ... 
fblk is NULL"); } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -1687,44 +1746,39 @@ xfs_dir2_node_add_datablk( error = xfs_dir3_free_get_buf(args, fbno, &fbp); if (error) return error; - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr); /* Remember the first slot as our empty slot. */ - freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, + hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)) * - dp->d_ops->free_max_bests(args->geo); + args->geo->free_max_bests; } else { - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr); } /* Set the freespace block index from the data block number. */ - *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno); + *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno); /* Extend the freespace table if the new data block is off the end. */ - if (*findex >= freehdr.nvalid) { - ASSERT(*findex < dp->d_ops->free_max_bests(args->geo)); - freehdr.nvalid = *findex + 1; - bests[*findex] = cpu_to_be16(NULLDATAOFF); + if (*findex >= hdr->nvalid) { + ASSERT(*findex < args->geo->free_max_bests); + hdr->nvalid = *findex + 1; + hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block (this should always be * true) then update the header. */ - if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) { - freehdr.nused++; - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) { + hdr->nused++; + xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr); xfs_dir2_free_log_header(args, fbp); } /* Update the freespace value for the new block in the table. 
*/ - hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); - bests[*findex] = bf[0].length; + bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr); + hdr->bests[*findex] = bf[0].length; *dbpp = dbp; *fbpp = fbp; @@ -1737,11 +1791,10 @@ xfs_dir2_node_find_freeblk( struct xfs_da_state_blk *fblk, xfs_dir2_db_t *dbnop, struct xfs_buf **fbpp, + struct xfs_dir3_icfree_hdr *hdr, int *findexp, int length) { - struct xfs_dir3_icfree_hdr freehdr; - struct xfs_dir2_free *free = NULL; struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; struct xfs_buf *fbp = NULL; @@ -1751,7 +1804,6 @@ xfs_dir2_node_find_freeblk( xfs_dir2_db_t dbno = -1; xfs_dir2_db_t fbno; xfs_fileoff_t fo; - __be16 *bests = NULL; int findex = 0; int error; @@ -1762,17 +1814,14 @@ xfs_dir2_node_find_freeblk( */ if (fblk) { fbp = fblk->bp; - free = fbp->b_addr; findex = fblk->index; + xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr); if (findex >= 0) { /* caller already found the freespace for us. */ - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); - - ASSERT(findex < freehdr.nvalid); - ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(bests[findex]) >= length); - dbno = freehdr.firstdb + findex; + ASSERT(findex < hdr->nvalid); + ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(hdr->bests[findex]) >= length); + dbno = hdr->firstdb + findex; goto found_block; } @@ -1814,15 +1863,13 @@ xfs_dir2_node_find_freeblk( if (!fbp) continue; - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr); /* Scan the free entry array for a large enough free space. 
*/ - for (findex = freehdr.nvalid - 1; findex >= 0; findex--) { - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && - be16_to_cpu(bests[findex]) >= length) { - dbno = freehdr.firstdb + findex; + for (findex = hdr->nvalid - 1; findex >= 0; findex--) { + if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF && + be16_to_cpu(hdr->bests[findex]) >= length) { + dbno = hdr->firstdb + findex; goto found_block; } } @@ -1838,7 +1885,6 @@ found_block: return 0; } - /* * Add the data entry for a node-format directory name addition. * The leaf entry is added in xfs_dir2_leafn_add. @@ -1853,9 +1899,9 @@ xfs_dir2_node_addname_int( struct xfs_dir2_data_entry *dep; /* data entry pointer */ struct xfs_dir2_data_hdr *hdr; /* data block header */ struct xfs_dir2_data_free *bf; - struct xfs_dir2_free *free = NULL; /* freespace block structure */ struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; + struct xfs_dir3_icfree_hdr freehdr; struct xfs_buf *dbp; /* data block buffer */ struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_data_aoff_t aoff; @@ -1867,11 +1913,10 @@ xfs_dir2_node_addname_int( int needlog = 0; /* need to log data header */ int needscan = 0; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ - __be16 *bests; - length = dp->d_ops->data_entsize(args->namelen); - error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex, - length); + length = xfs_dir2_data_entsize(dp->i_mount, args->namelen); + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr, + &findex, length); if (error) return error; @@ -1893,19 +1938,19 @@ xfs_dir2_node_addname_int( /* we're going to have to log the free block index later */ logfree = 1; error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, - &findex); + &freehdr, &findex); } else { /* Read the data block in. 
*/ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, dbno), - -1, &dbp); + 0, &dbp); } if (error) return error; /* setup for data block up now */ hdr = dbp->b_addr; - bf = dp->d_ops->data_bestfree_p(hdr); + bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); ASSERT(be16_to_cpu(bf[0].length) >= length); /* Point to the existing unused space. */ @@ -1926,28 +1971,26 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - dp->d_ops->data_put_ftype(dep, args->filetype); - tagp = dp->d_ops->data_entry_tag_p(dep); + xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); + tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(args, dbp, dep); /* Rescan the freespace and log the data block if needed. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, &needlog); + xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, dbp); /* If the freespace block entry is now wrong, update it. */ - free = fbp->b_addr; - bests = dp->d_ops->free_bests_p(free); - if (bests[findex] != bf[0].length) { - bests[findex] = bf[0].length; + if (freehdr.bests[findex] != bf[0].length) { + freehdr.bests[findex] = bf[0].length; logfree = 1; } /* Log the freespace entry if needed. */ if (logfree) - xfs_dir2_free_log_bests(args, fbp, findex, findex); + xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex); /* Return the data block and offset in args. */ args->blkno = (xfs_dablk_t)dbno; @@ -2155,8 +2198,6 @@ xfs_dir2_node_replace( int i; /* btree level */ xfs_ino_t inum; /* new inode number */ int ftype; /* new file type */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ int rval; /* internal return value */ xfs_da_state_t *state; /* btree cursor */ @@ -2188,16 +2229,17 @@ xfs_dir2_node_replace( * and locked it. 
But paranoia is good. */ if (rval == -EEXIST) { - struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->b_addr; - ents = args->dp->d_ops->leaf_ents_p(leaf); - lep = &ents[blk->index]; ASSERT(state->extravalid); + + xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr, + blk->bp->b_addr); /* * Point to the data entry. */ @@ -2207,13 +2249,13 @@ xfs_dir2_node_replace( dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, - be32_to_cpu(lep->address))); + be32_to_cpu(leafhdr.ents[blk->index].address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - args->dp->d_ops->data_put_ftype(dep, ftype); + xfs_dir2_data_put_ftype(state->mp, dep, ftype); xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } @@ -2270,7 +2312,7 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - dp->d_ops->free_hdr_from_disk(&freehdr, free); + xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free); /* * If there are used entries, there's nothing to do. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 59f9fb2241a5..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -8,7 +8,41 @@ struct dir_context; +/* + * In-core version of the leaf and free block headers to abstract the + * differences in the v2 and v3 disk format of the headers. + */ +struct xfs_dir3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. 
+ */ + struct xfs_dir2_leaf_entry *ents; +}; + +struct xfs_dir3_icfree_hdr { + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; + + /* + * Pointer to the on-disk format entries, which are behind the + * variable size (v4 vs v5) header in the on-disk block. + */ + __be16 *bests; +}; + /* xfs_dir2.c */ +xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, + const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, @@ -26,6 +60,15 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_buf *lbp, struct xfs_buf *dbp); /* xfs_dir2_data.c */ +struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr); +__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep); +uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep); +void xfs_dir2_data_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep, uint8_t ftype); + #ifdef DEBUG extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else @@ -34,10 +77,10 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, - xfs_daddr_t mapped_bno); +int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); +int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + unsigned int flags); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct 
xfs_dir2_data_hdr *hdr, @@ -47,10 +90,14 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); /* xfs_dir2_leaf.c */ -extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from); +void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); +int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -62,7 +109,8 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, - struct xfs_buf *bp, int first, int last); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first, + int last); extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); @@ -79,10 +127,11 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp, - struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr, - struct xfs_dir2_leaf *leaf); + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); /* xfs_dir2_node.c */ +void 
xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp, + struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from); extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp, @@ -108,6 +157,14 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ +xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); +xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr); +void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino); +uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep); +struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, @@ -118,9 +175,33 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); +static inline unsigned int +xfs_dir2_data_entsize( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int len; + + len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen + 
+ sizeof(xfs_dir2_data_off_t) /* tag */; + if (xfs_sb_version_hasftype(&mp->m_sb)) + len += sizeof(uint8_t); + return round_up(len, XFS_DIR2_DATA_ALIGN); +} + +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 85f14fc2a8da..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,6 +37,126 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); +int +xfs_dir2_sf_entsize( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = len; + + count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ + + if (xfs_sb_version_hasftype(&mp->m_sb)) + count += sizeof(uint8_t); + return count; +} + +struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. The actual inode numbers can + * come in two formats as well, either 4 bytes or 8 bytes wide. 
+ */ +xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + uint8_t *from = sfep->name + sfep->namelen; + + if (xfs_sb_version_hasftype(&mp->m_sb)) + from++; + + if (!hdr->i8count) + return get_unaligned_be32(from); + return get_unaligned_be64(from) & XFS_MAXINUMBER; +} + +void +xfs_dir2_sf_put_ino( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + uint8_t *to = sfep->name + sfep->namelen; + + ASSERT(ino <= XFS_MAXINUMBER); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + to++; + + if (hdr->i8count) + put_unaligned_be64(ino, to); + else + put_unaligned_be32(ino, to); +} + +xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + if (!hdr->i8count) + return get_unaligned_be32(hdr->parent); + return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER; +} + +void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + ASSERT(ino <= XFS_MAXINUMBER); + + if (hdr->i8count) + put_unaligned_be64(ino, hdr->parent); + else + put_unaligned_be32(ino, hdr->parent); +} + +/* + * The file type field is stored at the end of the name for filetype enabled + * shortform directories, or not at all otherwise. 
+ */ +uint8_t +xfs_dir2_sf_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep) +{ + if (xfs_sb_version_hasftype(&mp->m_sb)) { + uint8_t ftype = sfep->name[sfep->namelen]; + + if (ftype < XFS_DIR3_FT_MAX) + return ftype; + } + + return XFS_DIR3_FT_UNKNOWN; +} + +void +xfs_dir2_sf_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, + uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + sfep->name[sfep->namelen] = ftype; +} + /* * Given a block directory (dp/block), calculate its size as a shortform (sf) * directory and a header for the sf directory, if it will fit it the @@ -125,7 +245,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - dp->d_ops->sf_put_parent_ino(sfhp, parent); + xfs_dir2_sf_put_parent_ino(sfhp, parent); return size; } @@ -135,64 +255,48 @@ xfs_dir2_block_sfsize( */ int /* error */ xfs_dir2_block_to_sf( - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ struct xfs_buf *bp, int size, /* shortform directory size */ - xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ + struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */ { - xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dir2_data_entry_t *dep; /* data entry pointer */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* unused data pointer */ - char *endptr; /* end of data entries */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int error; /* error return value */ int logflags; /* inode logging flags */ - xfs_mount_t *mp; /* filesystem mount point */ - char *ptr; /* current data pointer */ - xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ - xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ + struct xfs_dir2_sf_entry *sfep; /* shortform entry */ + struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */ + 
unsigned int offset = args->geo->data_entry_offset; + unsigned int end; trace_xfs_dir2_block_to_sf(args); - dp = args->dp; - mp = dp->i_mount; - - /* - * allocate a temporary destination buffer the size of the inode - * to format the data into. Once we have formatted the data, we - * can free the block and copy the formatted data into the inode literal - * area. - */ - dst = kmem_alloc(mp->m_sb.sb_inodesize, 0); - hdr = bp->b_addr; - /* - * Copy the header into the newly allocate local space. + * Allocate a temporary destination buffer the size of the inode to + * format the data into. Once we have formatted the data, we can free + * the block and copy the formatted data into the inode literal area. */ - sfp = (xfs_dir2_sf_hdr_t *)dst; + sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0); memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); /* - * Set up to loop over the block's entries. + * Loop over the active and unused entries. Stop when we reach the + * leaf/tail portion of the block. */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = xfs_dir3_data_endp(args->geo, hdr); + end = xfs_dir3_data_end_offset(args->geo, bp->b_addr); sfep = xfs_dir2_sf_firstentry(sfp); - /* - * Loop over the active and unused entries. - * Stop when we reach the leaf/tail portion of the block. - */ - while (ptr < endptr) { + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + /* * If it's unused, just skip over it. */ - dup = (xfs_dir2_data_unused_t *)ptr; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); + offset += be16_to_cpu(dup->length); continue; } - dep = (xfs_dir2_data_entry_t *)ptr; + /* * Skip . */ @@ -204,24 +308,22 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' 
&& dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - dp->d_ops->sf_get_parent_ino(sfp)); + xfs_dir2_sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. */ else { sfep->namelen = dep->namelen; - xfs_dir2_sf_put_offset(sfep, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)hdr)); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, dep->name, dep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, + xfs_dir2_sf_put_ino(mp, sfp, sfep, be64_to_cpu(dep->inumber)); - dp->d_ops->sf_put_ftype(sfep, - dp->d_ops->data_get_ftype(dep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_data_get_ftype(mp, dep)); - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } - ptr += dp->d_ops->data_entsize(dep->namelen); + offset += xfs_dir2_data_entsize(mp, dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); @@ -240,7 +342,7 @@ xfs_dir2_block_to_sf( * Convert the inode to local format and copy the data in. */ ASSERT(dp->i_df.if_bytes == 0); - xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; dp->i_d.di_size = size; @@ -248,7 +350,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(dst); + kmem_free(sfp); return error; } @@ -277,13 +379,7 @@ xfs_dir2_sf_addname( ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Make sure the shortform value has some of its header. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -291,7 +387,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. 
*/ - incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); + incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen); objchange = 0; /* @@ -364,18 +460,17 @@ xfs_dir2_sf_addname_easy( xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ int new_isize) /* new directory size */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int byteoff; /* byte offset in sf dir */ - xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. @@ -388,8 +483,8 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); /* * Update the header and inode. @@ -416,9 +511,10 @@ xfs_dir2_sf_addname_hard( int objchange, /* changing inode number size */ int new_isize) /* new directory size */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int add_datasize; /* data size need for new ent */ char *buf; /* buffer for old */ - xfs_inode_t *dp; /* incore directory inode */ int eof; /* reached end of old dir */ int nbytes; /* temp for byte copies */ xfs_dir2_data_aoff_t new_offset; /* next offset value */ @@ -432,8 +528,6 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. 
*/ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; buf = kmem_alloc(old_isize, 0); @@ -444,13 +538,13 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = dp->d_ops->data_first_offset, + for (offset = args->geo->data_first_offset, oldsfep = xfs_dir2_sf_firstentry(oldsfp), - add_datasize = dp->d_ops->data_entsize(args->namelen), + add_datasize = xfs_dir2_data_entsize(mp, args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), + offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) @@ -479,8 +573,8 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); sfp->count++; if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) sfp->i8count++; @@ -488,7 +582,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. 
*/ if (!eof) { - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); @@ -510,7 +604,8 @@ xfs_dir2_sf_addname_pick( xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int holefit; /* found hole it will fit in */ int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ @@ -519,11 +614,9 @@ xfs_dir2_sf_addname_pick( int size; /* entry's data size */ int used; /* data bytes used */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - size = dp->d_ops->data_entsize(args->namelen); - offset = dp->d_ops->data_first_offset; + size = xfs_dir2_data_entsize(mp, args->namelen); + offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -535,8 +628,8 @@ xfs_dir2_sf_addname_pick( if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + - dp->d_ops->data_entsize(sfep->namelen); - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + xfs_dir2_data_entsize(mp, sfep->namelen); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -578,7 +671,8 @@ static void xfs_dir2_sf_check( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ @@ -586,23 +680,21 @@ xfs_dir2_sf_check( xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - dp = args->dp; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = dp->d_ops->data_first_offset; - ino = 
dp->d_ops->sf_get_parent_ino(sfp); + offset = args->geo->data_first_offset; + ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = dp->d_ops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + - dp->d_ops->data_entsize(sfep->namelen); - ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); + xfs_dir2_data_entsize(mp, sfep->namelen); + ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); @@ -622,22 +714,16 @@ xfs_dir2_sf_verify( struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - const struct xfs_dir_ops *dops; struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; int offset; - int size; + int64_t size; int error; uint8_t filetype; ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - /* - * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops, - * so we can only trust the mountpoint to have the right pointer. - */ - dops = xfs_dir_get_ops(mp, NULL); ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; @@ -653,12 +739,12 @@ xfs_dir2_sf_verify( endp = (char *)sfp + size; /* Check .. entry */ - ino = dops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) return __this_address; - offset = dops->data_first_offset; + offset = mp->m_dir_geo->data_first_offset; /* Check all reported entries */ sfep = xfs_dir2_sf_firstentry(sfp); @@ -680,7 +766,7 @@ xfs_dir2_sf_verify( * within the data buffer. 
The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = dops->sf_nextentry(sfp, sfep); + next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); if (endp < (char *)next_sfep) return __this_address; @@ -689,19 +775,19 @@ xfs_dir2_sf_verify( return __this_address; /* Check the inode number. */ - ino = dops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; error = xfs_dir_ino_validate(mp, ino); if (error) return __this_address; /* Check the file type. */ - filetype = dops->sf_get_ftype(sfep); + filetype = xfs_dir2_sf_get_ftype(mp, sfep); if (filetype >= XFS_DIR3_FT_MAX) return __this_address; offset = xfs_dir2_sf_get_offset(sfep) + - dops->data_entsize(sfep->namelen); + xfs_dir2_data_entsize(mp, sfep->namelen); sfep = next_sfep; } @@ -763,7 +849,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - dp->d_ops->sf_put_parent_ino(sfp, pino); + xfs_dir2_sf_put_parent_ino(sfp, pino); sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -779,7 +865,8 @@ int /* error */ xfs_dir2_sf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ @@ -790,16 +877,9 @@ xfs_dir2_sf_lookup( trace_xfs_dir2_sf_lookup(args); xfs_dir2_sf_check(args); - dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Bail out if the directory is way too short. 
- */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -818,7 +898,7 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = dp->d_ops->sf_get_parent_ino(sfp); + args->inumber = xfs_dir2_sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; args->filetype = XFS_DIR3_FT_DIR; return -EEXIST; @@ -828,18 +908,17 @@ xfs_dir2_sf_lookup( */ ci_sfep = NULL; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode * number. If it's the first case-insensitive match, store the * inode number and continue looking for an exact match. 
*/ - cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, - sfep->namelen); + cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); - args->filetype = dp->d_ops->sf_get_ftype(sfep); + args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep); + args->filetype = xfs_dir2_sf_get_ftype(mp, sfep); if (cmp == XFS_CMP_EXACT) return -EEXIST; ci_sfep = sfep; @@ -864,8 +943,9 @@ int /* error */ xfs_dir2_sf_removename( xfs_da_args_t *args) { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int byteoff; /* offset of removed entry */ - xfs_inode_t *dp; /* incore directory inode */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ @@ -875,17 +955,9 @@ xfs_dir2_sf_removename( trace_xfs_dir2_sf_removename(args); - dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); oldsize = (int)dp->i_d.di_size; - /* - * Bail out if the directory is way too short. - */ - if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -895,10 +967,10 @@ xfs_dir2_sf_removename( * Find the one we're deleting. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == + ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) == args->inumber); break; } @@ -912,7 +984,7 @@ xfs_dir2_sf_removename( * Calculate sizes. 
*/ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = dp->d_ops->sf_entsize(sfp, args->namelen); + entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -945,13 +1017,35 @@ xfs_dir2_sf_removename( } /* + * Check whether the sf dir replace operation need more blocks. + */ +bool +xfs_dir2_sf_replace_needblock( + struct xfs_inode *dp, + xfs_ino_t inum) +{ + int newsize; + struct xfs_dir2_sf_hdr *sfp; + + if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return false; + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; + + return inum > XFS_DIR2_MAX_SHORT_INUM && + sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); +} + +/* * Replace the inode number of an entry in a shortform directory. */ int /* error */ xfs_dir2_sf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ @@ -960,16 +1054,8 @@ xfs_dir2_sf_replace( trace_xfs_dir2_sf_replace(args); - dp = args->dp; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Bail out if the shortform directory is way too small. 
- */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } + ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; @@ -980,17 +1066,14 @@ xfs_dir2_sf_replace( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { int error; /* error return value */ - int newsize; /* new inode size */ - newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ - if (newsize > XFS_IFORK_DSIZE(dp)) { + if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) { error = xfs_dir2_sf_to_block(args); - if (error) { + if (error) return error; - } return xfs_dir2_block_replace(args); } /* @@ -1008,22 +1091,23 @@ xfs_dir2_sf_replace( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - ino = dp->d_ops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); - dp->d_ops->sf_put_parent_ino(sfp, args->inumber); + xfs_dir2_sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. 
*/ else { for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ino = dp->d_ops->sf_get_ino(sfp, sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); ASSERT(args->inumber != ino); - dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); - dp->d_ops->sf_put_ftype(sfep, args->filetype); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + args->inumber); + xfs_dir2_sf_put_ftype(mp, sfep, args->filetype); break; } } @@ -1076,8 +1160,9 @@ static void xfs_dir2_sf_toino4( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; char *buf; /* old dir's buffer */ - xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ @@ -1088,8 +1173,6 @@ xfs_dir2_sf_toino4( trace_xfs_dir2_sf_toino4(args); - dp = args->dp; - /* * Copy the old directory to the buffer. * Then nuke it from the inode, and add the new buffer to the inode. @@ -1116,21 +1199,22 @@ xfs_dir2_sf_toino4( */ sfp->count = oldsfp->count; sfp->i8count = 0; - dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. 
*/ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, - dp->d_ops->sf_get_ino(oldsfp, oldsfep)); - dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_sf_get_ftype(mp, oldsfep)); } /* * Clean up the inode. @@ -1149,8 +1233,9 @@ static void xfs_dir2_sf_toino8( xfs_da_args_t *args) /* operation arguments */ { + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; char *buf; /* old dir's buffer */ - xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ @@ -1161,8 +1246,6 @@ xfs_dir2_sf_toino8( trace_xfs_dir2_sf_toino8(args); - dp = args->dp; - /* * Copy the old directory to the buffer. * Then nuke it from the inode, and add the new buffer to the inode. @@ -1189,21 +1272,22 @@ xfs_dir2_sf_toino8( */ sfp->count = oldsfp->count; sfp->i8count = 1; - dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. 
*/ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), - oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); - dp->d_ops->sf_put_ino(sfp, sfep, - dp->d_ops->sf_get_ino(oldsfp, oldsfep)); - dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + xfs_dir2_sf_put_ino(mp, sfp, sfep, + xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep)); + xfs_dir2_sf_put_ftype(mp, sfep, + xfs_dir2_sf_get_ftype(mp, oldsfep)); } /* * Clean up the inode. diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index e8bd688a4073..bedc1e752b60 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,10 +35,10 @@ xfs_calc_dquots_per_chunk( xfs_failaddr_t xfs_dquot_verify( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index c968b60cee15..045556e78ee2 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } +/* + * v5 file systems support V3 inodes only, earlier file systems support + * v2 and v1 inodes. 
+ */ +static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, + uint8_t version) +{ + if (xfs_sb_version_has_v3inode(sbp)) + return version == 3; + return version == 1 || version == 2; +} + static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; @@ -560,7 +577,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -707,7 +723,6 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) /* * Size of the unlinked inode hash table in the agi. @@ -775,7 +790,6 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) /* * The third a.g. block contains the a.g. freelist, an array @@ -783,21 +797,15 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) - -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? 
\ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) +#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) -typedef struct xfs_agfl { +struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ -} __attribute__((packed)) xfs_agfl_t; +} __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -920,13 +928,13 @@ static inline uint xfs_dinode_size(int version) * This enum is used in string mapping in xfs_trace.h; please keep the * TRACE_DEFINE_ENUMs for it up to date. */ -typedef enum xfs_dinode_fmt { +enum xfs_dinode_fmt { XFS_DINODE_FMT_DEV, /* xfs_dev_t */ XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ XFS_DINODE_FMT_UUID /* added long ago, but never used */ -} xfs_dinode_fmt_t; +}; #define XFS_INODE_FORMAT_STR \ { XFS_DINODE_FMT_DEV, "dev" }, \ @@ -946,8 +954,12 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(sbp) \ + (xfs_sb_version_has_v3inode(sbp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) /* * Inode data & attribute fork sizes, per inode. @@ -956,13 +968,9 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + (XFS_DFORK_Q(dip) ? 
XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ @@ -1144,11 +1152,11 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the xfs_dquot_t that + * information for a user. This is the q_core of the struct xfs_dquot that * is kept in kernel memory. We pad this with some more expansion room * to construct the on disk structure. */ -typedef struct xfs_disk_dquot { +struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ @@ -1171,15 +1179,15 @@ typedef struct xfs_disk_dquot { __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ __be16 d_pad; -} xfs_disk_dquot_t; +}; /* * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[4]; /* filling for posterity */ + struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ + char dd_fill[4];/* filling for posterity */ /* * These two are only present on filesystems with the CRC bits set. @@ -1540,6 +1548,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. 
+ */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index e9371a8e0e26..245188e4f6d3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -324,7 +324,7 @@ typedef struct xfs_growfs_rt { * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE */ typedef struct xfs_bstime { - time_t tv_sec; /* seconds */ + __kernel_long_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } xfs_bstime_t; @@ -416,7 +416,7 @@ struct xfs_bulkstat { /* * Project quota id helpers (previously projid was 16bit only - * and using two 16bit values to hold new 32bit projid was choosen + * and using two 16bit values to hold new 32bit projid was chosen * to retain compatibility with "old" filesystems). */ static inline uint32_t @@ -568,10 +568,40 @@ typedef struct xfs_fsop_setdm_handlereq { struct fsdmidata __user *data; /* DMAPI data */ } xfs_fsop_setdm_handlereq_t; +/* + * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. + * + * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix. + */ +#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */ +#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */ +#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */ + typedef struct xfs_attrlist_cursor { __u32 opaque[4]; } xfs_attrlist_cursor_t; +/* + * Define how lists of attribute names are returned to userspace from the + * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the + * beginning of the returned buffer, and a each entry in al_offset contains the + * relative offset of an xfs_attrlist_ent containing the actual entry. 
+ * + * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and + * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr. + */ +struct xfs_attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +}; + +struct xfs_attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +}; + typedef struct xfs_fsop_attrlist_handlereq { struct xfs_fsop_handlereq hreq; /* handle interface structure */ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ @@ -589,7 +619,7 @@ typedef struct xfs_attr_multiop { void __user *am_attrname; void __user *am_attrvalue; __u32 am_length; - __u32 am_flags; + __u32 am_flags; /* XFS_IOC_ATTR_* */ } xfs_attr_multiop_t; typedef struct xfs_fsop_attrmulti_handlereq { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 588d44613094..7fcf62b324b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,7 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -177,7 +177,7 @@ xfs_inobt_insert( xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; @@ -276,6 +276,7 @@ xfs_ialloc_inode_init( int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; + int error; /* * Loop over the new block(s), filling in the inodes. For small block @@ -303,7 +304,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -327,19 +328,18 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * M_IGEO(mp)->blocks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * - M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED); - if (!fbuf) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, + XBF_UNMAPPED, &fbuf); + if (error) + return error; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = xfs_dinode_size(version); + uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -525,7 +525,7 @@ xfs_inobt_insert_sprec( bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; @@ -544,7 +544,10 @@ xfs_inobt_insert_sprec( nrec->ir_free, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } goto out; } @@ -557,17 +560,23 @@ xfs_inobt_insert_sprec( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); - XFS_WANT_CORRUPTED_GOTO(mp, - rec.ir_startino == nrec->ir_startino, - error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + error = -EFSCORRUPTED; + goto error; + } /* * This should never fail. 
If we have coexisting records that * cannot merge, something is seriously wrong. */ - XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), - error); + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + error = -EFSCORRUPTED; + goto error; + } trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, @@ -649,7 +658,7 @@ xfs_ialloc_ag_alloc( * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ - agi = XFS_BUF_TO_AGI(agbp); + agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + @@ -1057,7 +1066,8 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; } return 0; @@ -1081,7 +1091,8 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; } return 0; @@ -1119,7 +1130,7 @@ xfs_dialloc_ag_inobt( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1161,12 +1172,18 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); + if (XFS_IS_CORRUPT(mp, j != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (rec.ir_freecount > 0) { /* @@ -1321,19 +1338,28 @@ 
xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } } alloc_inode: @@ -1393,7 +1419,8 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + return -EFSCORRUPTED; /* * See if we've landed in the parent inode record. The finobt @@ -1416,10 +1443,16 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); + if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + error = -EFSCORRUPTED; + goto error_rcur; + } } - XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + error = -EFSCORRUPTED; + goto error_rcur; + } if (i == 1 && j == 1) { /* * Both the left and right records are valid. 
Choose the closer @@ -1472,7 +1505,8 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; return 0; } } @@ -1483,12 +1517,14 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; return 0; } @@ -1510,20 +1546,24 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + return -EFSCORRUPTED; ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && - (rec.ir_freecount == frec->ir_freecount)); + if (XFS_IS_CORRUPT(cur->bc_mp, + rec.ir_free != frec->ir_free || + rec.ir_freecount != frec->ir_freecount)) + return -EFSCORRUPTED; return xfs_inobt_update(cur, &rec); } @@ -1543,7 +1583,7 @@ xfs_dialloc_ag( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1903,7 +1943,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct 
xfs_inobt_rec_incore *orec) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_perag *pag; struct xfs_btree_cur *cur; @@ -1933,14 +1973,20 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error0; + } /* * Get the offset in the inode chunk. */ @@ -2033,7 +2079,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2052,7 +2098,10 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. 
*/ - XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); + if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + error = -EFSCORRUPTED; + goto error; + } error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, ibtrec->ir_count, @@ -2075,14 +2124,20 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto error; + } rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && - (rec.ir_freecount == ibtrec->ir_freecount), - error); + if (XFS_IS_CORRUPT(mp, + rec.ir_free != ibtrec->ir_free || + rec.ir_freecount != ibtrec->ir_freecount)) { + error = -EFSCORRUPTED; + goto error; + } /* * The content of inobt records should always match between the inobt @@ -2434,9 +2489,8 @@ xfs_ialloc_log_agi( sizeof(xfs_agi_t) }; #ifdef DEBUG - xfs_agi_t *agi; /* allocation group header */ + struct xfs_agi *agi = bp->b_addr; - agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif @@ -2468,14 +2522,13 @@ xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int i; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } @@ -2538,6 +2591,7 @@ xfs_agi_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); @@ -2550,7 +2604,7 @@ xfs_agi_write_verify( return; if (bip) - XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); 
xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } @@ -2606,7 +2660,7 @@ xfs_ialloc_read_agi( if (error) return error; - agi = XFS_BUF_TO_AGI(*bpp); + agi = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); @@ -2818,7 +2872,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; @@ -2854,3 +2908,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. 
+ * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b82992f795aa..b2c122ad8f0e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" @@ -20,7 +21,6 @@ #include "xfs_trans.h" #include "xfs_rmap.h" - STATIC int xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, @@ -34,7 +34,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -44,8 +44,8 @@ xfs_inobt_set_root( union 
xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_root = nptr->s; be32_add_cpu(&agi->agi_level, inc); @@ -58,8 +58,8 @@ xfs_finobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_free_root = nptr->s; be32_add_cpu(&agi->agi_free_level, inc); @@ -83,7 +83,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -212,9 +212,9 @@ xfs_inobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -224,9 +224,9 @@ xfs_finobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Allocate a new inode btree cursor. + * Initialize a new inode btree cursor. 
*/ -struct xfs_btree_cur * /* new inode btree cursor */ -xfs_inobt_init_cursor( +static struct xfs_btree_cur * +xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agi structure */ xfs_agnumber_t agno, /* allocation group number */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; if (btnum == XFS_BTNUM_INO) { - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_ops = &xfs_inobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); + cur->bc_ops = &xfs_inobt_ops; } else { - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); - cur->bc_ops = &xfs_finobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); + cur->bc_ops = &xfs_finobt_ops; } cur->bc_blocklog = mp->m_sb.sb_blocklog; @@ -433,12 +428,75 @@ xfs_inobt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; + return cur; +} + +/* Create an inode btree cursor. */ +struct xfs_btree_cur * +xfs_inobt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agbp->b_addr; + cur = xfs_inobt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_INO) + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + else + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create an inode btree cursor with a fake root for staging. 
*/ +struct xfs_btree_cur * +xfs_inobt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new inobt btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_inobt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agi *agi = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + if (cur->bc_btnum == XFS_BTNUM_INO) { + agi->agi_root = cpu_to_be32(afake->af_root); + agi->agi_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + } else { + agi->agi_free_root = cpu_to_be32(afake->af_root); + agi->agi_free_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | + XFS_AGI_FREE_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + } +} + /* * Calculate number of records in an inobt btree block. 
*/ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 951305ecaae1..35bbd978c272 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -48,6 +48,9 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 7bc87408f1a0..52451809c478 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -596,7 +596,7 @@ xfs_iext_realloc_root( struct xfs_ifork *ifp, struct xfs_iext_cursor *cur) { - size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); + int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); void *new; /* account for the prev/next pointers */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 28ab3c5255e1..39c5a6e24915 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -44,17 +44,6 @@ xfs_inobp_check( } #endif -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and 
hence @@ -93,7 +82,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(mp, dip->di_version) && + xfs_dinode_good_version(&mp->m_sb, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -205,26 +194,23 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. */ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); - to->di_projid_lo = 0; - to->di_projid_hi = 0; - to->di_version = 2; + to->di_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | + be16_to_cpu(from->di_projid_lo); } to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -253,11 +239,11 @@ xfs_inode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - if (to->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime.tv_sec = 
be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -275,12 +261,11 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); + to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); + to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); @@ -304,10 +289,11 @@ xfs_inode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -316,6 +302,7 @@ xfs_inode_to_disk( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -429,7 +416,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... 
*/ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -455,7 +442,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -630,12 +617,9 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && + xfs_sb_version_has_v3inode(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { - /* initialise the on-disk inode core */ - memset(&ip->i_d, 0, sizeof(ip->i_d)); VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; return 0; } @@ -677,7 +661,6 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. */ - ip->i_d.di_version = dip->di_version; VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); @@ -691,7 +674,6 @@ xfs_iread( VFS_I(ip)->i_mode = 0; } - ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index ab0f84165317..9b373dcf9e34 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,13 +16,9 @@ struct xfs_dinode; * format specific structures at the appropriate time. 
*/ struct xfs_icdinode { - int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ - uint16_t di_projid_lo; /* lower part of owner's project id */ - uint16_t di_projid_hi; /* higher part of owner's project id */ + uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -37,7 +33,7 @@ struct xfs_icdinode { uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ - xfs_ictimestamp_t di_crtime; /* time created */ + struct timespec64 di_crtime; /* time created */ }; /* @@ -62,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index c643beeb5a24..518c6f0ec3a6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -75,11 +75,15 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); return -EFSCORRUPTED; } break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); return -EFSCORRUPTED; } if (error) @@ -110,14 +114,16 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); break; default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; break; } if (error) { - 
kmem_zone_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; if (ip->i_cowfp) - kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } @@ -129,7 +135,7 @@ xfs_init_local_fork( struct xfs_inode *ip, int whichfork, const void *data, - int size) + int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int mem_size = size, real_size = 0; @@ -177,7 +183,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %Lu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -467,11 +473,11 @@ xfs_iroot_realloc( void xfs_idata_realloc( struct xfs_inode *ip, - int byte_diff, + int64_t byte_diff, int whichfork) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int new_size = (int)ifp->if_bytes + byte_diff; + int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); @@ -525,10 +531,10 @@ xfs_idestroy_fork( } if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; } else if (whichfork == XFS_COW_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); ip->i_cowfp = NULL; } } @@ -552,7 +558,7 @@ xfs_iextents_copy( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; - int copied = 0; + int64_t copied = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 00c62ce170d0..668ee942be22 100644 --- 
a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,16 +13,16 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. */ struct xfs_ifork { - int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* fork mod counter */ + int64_t if_bytes; /* bytes in if_u1 */ struct xfs_btree_block *if_broot; /* file's incore btree root */ - short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ + unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ union { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ }; /* @@ -46,14 +46,9 @@ struct xfs_ifork { (ip)->i_afp : \ (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) + (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) + (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? 
\ XFS_IFORK_DSIZE(ip) : \ @@ -87,18 +82,24 @@ struct xfs_ifork { #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +#define xfs_ifork_has_extents(ip, w) \ + (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ + XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) + struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); -void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, + int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); -void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); +void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, + const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e5f97c69b320..e3400c9c71cd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -424,17 +424,15 @@ struct xfs_log_dinode { /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? 
\ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* - * Buffer Log Format defintions + * Buffer Log Format definitions * - * These are the physical dirty bitmap defintions for the log format structure. + * These are the physical dirty bitmap definitions for the log format structure. */ #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 @@ -462,11 +460,20 @@ static inline uint xfs_log_dinode_size(int version) #define XFS_BLF_GDQUOT_BUF (1<<4) /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + * This is the structure used to lay out a buf log item in the log. The data + * map describes which 128 byte chunks of the buffer have been logged. + * + * The placement of blf_map_size causes blf_data_map to start at an odd + * multiple of sizeof(unsigned int) offset within the struct. Because the data + * bitmap size will always be an even number, the end of the data_map (and + * therefore the structure) will also be at an odd multiple of sizeof(unsigned + * int). Some 64-bit compilers will insert padding at the end of the struct to + * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore, + * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and + * keep the structure size consistent between 32-bit and 64-bit platforms. 
+ */ +#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index f3d18eaecebb..3bf671637a91 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -30,14 +30,14 @@ typedef struct xlog_recover_item { xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; -typedef struct xlog_recover { +struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ xfs_trans_header_t r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ -} xlog_recover_t; +}; #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 9a7fadb1361c..2076627243b0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,7 +46,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -63,7 +63,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -80,7 +80,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 
0; @@ -108,7 +108,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -119,7 +119,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_private.a.agno; + agno = cur->bc_ag.agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -144,7 +144,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); return 0; out_bad_rec: @@ -169,14 +169,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -193,19 +193,22 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; error = xfs_btree_insert(cur, i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + 
cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -227,17 +230,23 @@ xfs_refcount_delete( error = xfs_refcount_get_rec(cur, &irec, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); error = xfs_btree_delete(cur, i); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (error) goto out_error; error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -349,12 +358,15 @@ xfs_refcount_split_extent( error = xfs_refcount_get_rec(cur, &rcext, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, &rcext, agbno); /* Establish the right extent. 
*/ @@ -371,12 +383,15 @@ xfs_refcount_split_extent( error = xfs_refcount_insert(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } return error; out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -396,7 +411,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_private.a.agno, left, center, right); + cur->bc_ag.agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -410,19 +425,27 @@ xfs_refcount_merge_center_extents( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (center->rc_refcount > 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the left extent. 
*/ @@ -430,7 +453,10 @@ xfs_refcount_merge_center_extents( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } left->rc_blockcount = extlen; error = xfs_refcount_update(cur, left); @@ -442,7 +468,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -461,7 +487,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_private.a.agno, left, cleft); + cur->bc_ag.agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { @@ -469,14 +495,18 @@ xfs_refcount_merge_left_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the left extent. 
*/ @@ -484,7 +514,10 @@ xfs_refcount_merge_left_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } left->rc_blockcount += cleft->rc_blockcount; error = xfs_refcount_update(cur, left); @@ -497,7 +530,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -515,7 +548,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_private.a.agno, cright, right); + cur->bc_ag.agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -526,14 +559,18 @@ xfs_refcount_merge_right_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* Enlarge the right extent. 
*/ @@ -541,7 +578,10 @@ xfs_refcount_merge_right_extent( &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } right->rc_startblock -= cright->rc_blockcount; right->rc_blockcount += cright->rc_blockcount; @@ -554,7 +594,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -587,7 +627,10 @@ xfs_refcount_find_left_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (xfs_refc_next(&tmp) != agbno) return 0; @@ -605,8 +648,10 @@ xfs_refcount_find_left_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) @@ -634,13 +679,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -671,7 +716,10 @@ xfs_refcount_find_right_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 
1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (tmp.rc_startblock != agbno + aglen) return 0; @@ -689,8 +737,10 @@ xfs_refcount_find_right_extents( error = xfs_refcount_get_rec(cur, &tmp, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, - out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) @@ -718,13 +768,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -833,7 +883,7 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_private.a.priv.refc.shape_changes * + overhead = cur->bc_ag.refc.shape_changes * xfs_allocfree_log_count(cur->bc_mp, 1); overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -841,17 +891,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. 
*/ - if (cur->bc_private.a.priv.refc.nr_ops > 2 && + if (cur->bc_ag.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_private.a.priv.refc.nr_ops == 0) + if (cur->bc_ag.refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -902,7 +952,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); /* * Either cover the hole (increment) or @@ -913,12 +963,15 @@ xfs_refcount_adjust_extents( &found_tmp); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_tmp == 1, out_error); - cur->bc_private.a.priv.refc.nr_ops++; + if (XFS_IS_CORRUPT(cur->bc_mp, + found_tmp != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } + cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -945,23 +998,25 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_rec == 1, out_error); - cur->bc_private.a.priv.refc.nr_ops++; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } + cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, 
- cur->bc_private.a.agno, + cur->bc_ag.agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -980,7 +1035,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1002,10 +1057,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* @@ -1033,7 +1088,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_private.a.priv.refc.shape_changes++; + cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, @@ -1044,7 +1099,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1060,7 +1115,7 @@ xfs_refcount_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -1110,9 +1165,9 @@ xfs_refcount_finish_one( * the startblock, get one now. 
*/ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { - nr_ops = rcur->bc_private.a.priv.refc.nr_ops; - shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + if (rcur != NULL && rcur->bc_ag.agno != agno) { + nr_ops = rcur->bc_ag.refc.nr_ops; + shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1122,16 +1177,14 @@ xfs_refcount_finish_one( XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - if (!agbp) - return -EFSCORRUPTED; rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); if (!rcur) { error = -ENOMEM; goto out_cur; } - rcur->bc_private.a.priv.refc.nr_ops = nr_ops; - rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + rcur->bc_ag.refc.nr_ops = nr_ops; + rcur->bc_ag.refc.shape_changes = shape_changes; } *pcur = rcur; @@ -1250,7 +1303,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* By default, skip the whole range */ @@ -1272,7 +1325,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1284,7 +1340,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* If the extent starts after the range we want, bail out */ @@ -1312,7 +1371,10 @@ xfs_refcount_find_shared( error = xfs_refcount_get_rec(cur, &tmp, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, 
out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; @@ -1321,12 +1383,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_private.a.agno, *fbno, *flen); + cur->bc_ag.agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1413,39 +1475,52 @@ xfs_refcount_adjust_cow_extents( switch (adj) { case XFS_REFCOUNT_ADJUST_COW_ALLOC: /* Adding a CoW reservation, there should be nothing here. */ - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_startblock >= agbno + aglen, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, + agbno + aglen > ext.rc_startblock)) { + error = -EFSCORRUPTED; + goto out_error; + } tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_tmp == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } break; case XFS_REFCOUNT_ADJUST_COW_FREE: /* Removing a CoW reservation, there should be one extent. 
*/ - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_startblock == agbno, out_error); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_blockcount == aglen, out_error); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - ext.rc_refcount == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, - found_rec == 1, out_error); + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } break; default: ASSERT(0); @@ -1454,7 +1529,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1500,7 +1575,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1514,7 +1589,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1531,7 +1606,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Remove refcount btree reservation */ @@ 
-1584,14 +1659,15 @@ struct xfs_refcount_recovery { /* Stuff an extent on the recovery list. */ STATIC int xfs_refcount_recover_extent( - struct xfs_btree_cur *cur, + struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv) { struct list_head *debris = priv; struct xfs_refcount_recovery *rr; - if (be32_to_cpu(rec->refc.rc_refcount) != 1) + if (XFS_IS_CORRUPT(cur->bc_mp, + be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); @@ -1640,10 +1716,6 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; - if (!agbp) { - error = -ENOMEM; - goto out_trans; - } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); /* Find all the leftover CoW staging extents. */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 38529dbacd55..7fd6044a4f78 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -25,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -34,8 +35,8 @@ xfs_refcountbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -57,8 +58,8 @@ xfs_refcountbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf 
*agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ @@ -66,7 +67,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -75,13 +76,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.agno == cur->bc_ag.agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -101,12 +102,12 @@ xfs_refcountbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -169,9 +170,9 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -311,41 
+312,90 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Allocate a new refcount btree cursor. + * Initialize a new refcount btree cursor. */ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( +static struct xfs_btree_cur * +xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(agno != NULLAGNUMBER); ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_refcountbt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.priv.refc.nr_ops = 0; - cur->bc_private.a.priv.refc.shape_changes = 0; + cur->bc_ag.refc.nr_ops = 0; + cur->bc_ag.refc.shape_changes = 0; + cur->bc_ops = &xfs_refcountbt_ops; + return cur; +} +/* Create a btree cursor. */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, tp, agno); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create a btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_refcountbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Swap in the new btree root. 
Once we pass this point the newly rebuilt btree + * is in place and we have to kill off all the old btree blocks. + */ +void +xfs_refcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_refcount_root = cpu_to_be32(afake->af_root); + agf->agf_refcount_level = cpu_to_be32(afake->af_levels); + agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | + XFS_AGF_REFCOUNT_ROOT | + XFS_AGF_REFCOUNT_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); +} + /* * Calculate the number of records in a refcount btree block. */ @@ -420,7 +470,7 @@ xfs_refcountbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index ba416f71c824..69dc515db671 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size @@ -46,6 +47,8 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -58,4 +61,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +void 
xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 38e9414878b3..27c39268c31f 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -79,7 +79,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +91,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -107,13 +107,16 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } rcur->bc_rec.r.rm_startblock = agbno; rcur->bc_rec.r.rm_blockcount = len; @@ -123,11 +126,14 @@ xfs_rmap_insert( error = xfs_btree_insert(rcur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -143,22 +149,28 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, 
offset, flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_delete(rcur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -185,7 +197,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -248,7 +260,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -300,7 +312,7 @@ xfs_rmap_find_left_neighbor( info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -308,7 +320,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -324,7 +336,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ 
-373,14 +385,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -406,24 +418,39 @@ xfs_rmap_free_check_owner( return 0; /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (rec->rm_flags & XFS_RMAP_UNWRITTEN), out); + if (XFS_IS_CORRUPT(mp, + (flags & XFS_RMAP_UNWRITTEN) != + (rec->rm_flags & XFS_RMAP_UNWRITTEN))) { + error = -EFSCORRUPTED; + goto out; + } /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out); + if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) { + error = -EFSCORRUPTED; + goto out; + } /* Check the offset, if necessary. 
*/ if (XFS_RMAP_NON_INODE_OWNER(owner)) goto out; if (flags & XFS_RMAP_BMBT_BLOCK) { - XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK, - out); + if (XFS_IS_CORRUPT(mp, + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) { + error = -EFSCORRUPTED; + goto out; + } } else { - XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out); - XFS_WANT_CORRUPTED_GOTO(mp, - ltoff + rec->rm_blockcount >= offset + len, - out); + if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) { + error = -EFSCORRUPTED; + goto out; + } + if (XFS_IS_CORRUPT(mp, + offset + len > ltoff + rec->rm_blockcount)) { + error = -EFSCORRUPTED; + goto out; + } } out: @@ -471,7 +498,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -482,14 +509,20 @@ xfs_rmap_unmap( error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_rmap_get_rec(cur, &ltrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -502,8 +535,12 @@ xfs_rmap_unmap( * be the case that the "left" extent goes all the way to EOFS. 
*/ if (owner == XFS_RMAP_OWN_NULL) { - XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock + - ltrec.rm_blockcount, out_error); + if (XFS_IS_CORRUPT(mp, + bno < + ltrec.rm_startblock + ltrec.rm_blockcount)) { + error = -EFSCORRUPTED; + goto out_error; + } goto out_done; } @@ -526,15 +563,22 @@ xfs_rmap_unmap( error = xfs_rmap_get_rec(cur, &rtrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } if (rtrec.rm_startblock >= bno + len) goto out_done; } /* Make sure the extent we found covers the entire freeing range. */ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + if (XFS_IS_CORRUPT(mp, + ltrec.rm_startblock > bno || + ltrec.rm_startblock + ltrec.rm_blockcount < + bno + len)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Check owner information. */ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner, @@ -544,14 +588,17 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } else if (ltrec.rm_startblock == bno) { /* * overlap left hand side of extent: move the start, trim the @@ -619,7 +666,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -631,11 +678,11 @@ 
xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -726,7 +773,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -743,9 +790,12 @@ xfs_rmap_map( error = xfs_rmap_get_rec(cur, &ltrec, &have_lt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + if (XFS_IS_CORRUPT(mp, have_lt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -753,9 +803,12 @@ xfs_rmap_map( have_lt = 0; } - XFS_WANT_CORRUPTED_GOTO(mp, - have_lt == 0 || - ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error); + if (XFS_IS_CORRUPT(mp, + have_lt != 0 && + ltrec.rm_startblock + ltrec.rm_blockcount > bno)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Increment the cursor to see if we have a right-adjacent record to our @@ -769,11 +822,16 @@ xfs_rmap_map( error = xfs_rmap_get_rec(cur, &gtrec, &have_gt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock, - out_error); + if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, 
gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(&gtrec, owner, flags)) @@ -812,7 +870,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -821,7 +879,10 @@ xfs_rmap_map( error = xfs_btree_delete(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } /* point the cursor back to the left record and update */ @@ -860,19 +921,22 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -946,7 +1010,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? 
XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -957,14 +1021,20 @@ xfs_rmap_convert( error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_rmap_get_rec(cur, &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, PREV.rm_startblock, + cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -995,12 +1065,18 @@ xfs_rmap_convert( error = xfs_rmap_get_rec(cur, &LEFT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - XFS_WANT_CORRUPTED_GOTO(mp, - LEFT.rm_startblock + LEFT.rm_blockcount <= bno, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, + LEFT.rm_startblock + LEFT.rm_blockcount > + bno)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, LEFT.rm_startblock, + cur->bc_ag.agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1017,7 +1093,10 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_increment(cur, 0, &i); if (error) goto done; @@ -1026,11 +1105,16 @@ xfs_rmap_convert( error = xfs_rmap_get_rec(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - 
XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1048,14 +1132,17 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* reset the cursor back to PREV */ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Switch out based on the FILLING and CONTIG state bits. 
@@ -1071,31 +1158,46 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = LEFT; NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; error = xfs_rmap_update(cur, &NEW); @@ -1108,18 +1210,24 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. 
*/ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = LEFT; NEW.rm_blockcount += PREV.rm_blockcount; error = xfs_rmap_update(cur, &NEW); @@ -1135,19 +1243,28 @@ xfs_rmap_convert( error = xfs_btree_increment(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); error = xfs_btree_delete(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW = PREV; NEW.rm_blockcount = len + RIGHT.rm_blockcount; NEW.rm_flags = newext; @@ -1209,12 +1326,15 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: @@ -1253,19 
+1373,25 @@ xfs_rmap_convert( oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_startblock = bno; NEW.rm_owner = owner; NEW.rm_offset = offset; NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case 0: @@ -1288,14 +1414,17 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -1305,16 +1434,22 @@ xfs_rmap_convert( oldext, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + if (XFS_IS_CORRUPT(mp, i != 0)) { + error = -EFSCORRUPTED; + goto done; + } /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } break; case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: @@ -1330,12 +1465,12 @@ xfs_rmap_convert( 
ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1371,7 +1506,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1383,7 +1518,10 @@ xfs_rmap_convert_shared( &PREV, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } ASSERT(PREV.rm_offset <= offset); ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); @@ -1406,9 +1544,12 @@ xfs_rmap_convert_shared( goto done; if (i) { state |= RMAP_LEFT_VALID; - XFS_WANT_CORRUPTED_GOTO(mp, - LEFT.rm_startblock + LEFT.rm_blockcount <= bno, - done); + if (XFS_IS_CORRUPT(mp, + LEFT.rm_startblock + LEFT.rm_blockcount > + bno)) { + error = -EFSCORRUPTED; + goto done; + } if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) state |= RMAP_LEFT_CONTIG; } @@ -1423,11 +1564,16 @@ xfs_rmap_convert_shared( error = xfs_rmap_get_rec(cur, &RIGHT, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, - done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } + if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + error = -EFSCORRUPTED; + goto done; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1443,7 
+1589,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. @@ -1472,7 +1618,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1495,7 +1644,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += PREV.rm_blockcount; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1518,7 +1670,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += RIGHT.rm_blockcount; NEW.rm_flags = RIGHT.rm_flags; error = xfs_rmap_update(cur, &NEW); @@ -1538,7 +1693,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_flags = newext; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1570,7 +1728,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount += len; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1612,7 +1773,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - 
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount = offset - NEW.rm_offset; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1644,7 +1808,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount -= len; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1679,7 +1846,10 @@ xfs_rmap_convert_shared( NEW.rm_offset, NEW.rm_flags, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto done; + } NEW.rm_blockcount = offset - NEW.rm_offset; error = xfs_rmap_update(cur, &NEW); if (error) @@ -1710,12 +1880,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1753,7 +1923,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1765,25 +1935,44 @@ xfs_rmap_unmap_shared( &ltrec, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltoff = ltrec.rm_offset; /* Make sure the extent we found covers the entire freeing range. 
*/ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && - ltrec.rm_startblock + ltrec.rm_blockcount >= - bno + len, out_error); + if (XFS_IS_CORRUPT(mp, + ltrec.rm_startblock > bno || + ltrec.rm_startblock + ltrec.rm_blockcount < + bno + len)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Make sure the owner matches what we expect to find in the tree. */ - XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) { + error = -EFSCORRUPTED; + goto out_error; + } /* Make sure the unwritten flag matches. */ - XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == - (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + if (XFS_IS_CORRUPT(mp, + (flags & XFS_RMAP_UNWRITTEN) != + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) { + error = -EFSCORRUPTED; + goto out_error; + } /* Check the offset. */ - XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); - XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, - out_error); + if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) { + error = -EFSCORRUPTED; + goto out_error; + } + if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) { + error = -EFSCORRUPTED; + goto out_error; + } if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* Exact match, simply remove the record from rmap tree. 
*/ @@ -1836,7 +2025,10 @@ xfs_rmap_unmap_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltrec.rm_blockcount -= len; error = xfs_rmap_update(cur, &ltrec); if (error) @@ -1862,7 +2054,10 @@ xfs_rmap_unmap_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } ltrec.rm_blockcount = bno - ltrec.rm_startblock; error = xfs_rmap_update(cur, &ltrec); if (error) @@ -1877,12 +2072,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1917,7 +2112,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? 
*/ @@ -1938,9 +2133,12 @@ xfs_rmap_map_shared( error = xfs_rmap_get_rec(cur, &gtrec, &have_gt); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -1987,7 +2185,10 @@ xfs_rmap_map_shared( ltrec.rm_offset, ltrec.rm_flags, &i); if (error) goto out_error; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_error; + } error = xfs_rmap_update(cur, &ltrec); if (error) @@ -2030,12 +2231,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2135,7 +2336,7 @@ xfs_rmap_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -2185,7 +2386,7 @@ xfs_rmap_finish_one( * the startblock, get one now. 
*/ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { + if (rcur != NULL && rcur->bc_ag.agno != agno) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2199,7 +2400,7 @@ xfs_rmap_finish_one( error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; - if (!agbp) + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) return -EFSCORRUPTED; rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); @@ -2493,7 +2694,6 @@ struct xfs_rmap_key_state { uint64_t owner; uint64_t offset; unsigned int flags; - bool has_rmap; }; /* For each rmap given, figure out if it doesn't match the key we want. */ @@ -2508,7 +2708,6 @@ xfs_rmap_has_other_keys_helper( if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; - rks->has_rmap = true; return -ECANCELED; } @@ -2530,7 +2729,7 @@ xfs_rmap_has_other_keys( int error; xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - rks.has_rmap = false; + *has_rmap = false; low.rm_startblock = bno; memset(&high, 0xFF, sizeof(high)); @@ -2538,11 +2737,12 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); - if (error < 0) - return error; + if (error == -ECANCELED) { + *has_rmap = true; + return 0; + } - *has_rmap = rks.has_rmap; - return 0; + return error; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fc78efa52c94..b7c05314d07c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -14,6 +14,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" @@ -51,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, 
cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -60,8 +61,8 @@ xfs_rmapbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -83,25 +84,25 @@ xfs_rmapbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); @@ -109,7 +110,7 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); *stat = 1; return 0; @@ -120,13 +121,13 @@ xfs_rmapbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - 
trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -138,7 +139,7 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); return 0; } @@ -215,9 +216,9 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -448,17 +449,12 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .recs_inorder = xfs_rmapbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * -xfs_rmapbt_init_cursor( +static struct xfs_btree_cur * +xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); @@ -468,17 +464,68 @@ xfs_rmapbt_init_cursor( cur->bc_btnum = XFS_BTNUM_RMAP; cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; + + return cur; +} + +/* Create a new reverse mapping btree cursor. 
*/ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_rmapbt_init_common(mp, tp, agno); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agbp = agbp; + return cur; +} - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; +/* Create a new reverse mapping btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_rmapbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + cur = xfs_rmapbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | + XFS_AGF_RMAP_BLOCKS); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); +} + +/* * Calculate number of records in an rmap btree block. 
*/ int @@ -569,7 +616,7 @@ xfs_rmapbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 820d668b063d..115c3455a734 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -9,6 +9,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -43,6 +44,10 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); +void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 8ea1efc97b41..f42c74cb8be5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -15,7 +15,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" - +#include "xfs_error.h" /* * Realtime allocator bitmap functions shared with userspace. 
@@ -70,7 +70,7 @@ xfs_rtbuf_get( if (error) return error; - if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ac6cdca63e15..c526c5e5ab76 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" @@ -219,7 +220,7 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; @@ -327,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if (sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || @@ -680,7 +713,7 @@ xfs_sb_read_verify( { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; - struct xfs_dsb *dsb = 
XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; int error; /* @@ -706,7 +739,7 @@ xfs_sb_read_verify( * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -729,7 +762,7 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ @@ -747,13 +780,14 @@ xfs_sb_write_verify( struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -765,7 +799,7 @@ xfs_sb_write_verify( return; if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; @@ -926,7 +960,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } @@ -984,9 +1018,9 @@ xfs_update_secondary_sbs( for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { struct xfs_buf *bp; - bp = xfs_buf_get(mp->m_ddev_targp, + error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), - XFS_FSS_TO_BB(mp, 1)); + XFS_FSS_TO_BB(mp, 1), &bp); /* * If we 
get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based @@ -994,19 +1028,19 @@ xfs_update_secondary_sbs( * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ - if (!bp) { + if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", agno); if (!saved_error) - saved_error = -ENOMEM; + saved_error = error; continue; } bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); @@ -1184,13 +1218,14 @@ xfs_sb_get_secondary( struct xfs_buf **bpp) { struct xfs_buf *bp; + int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (!bp) - return -ENOMEM; + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); *bpp = bp; diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index a9ad90926b87..2b8ccb5b975d 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -55,7 +55,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - struct timespec64 tv; + struct timespec64 tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -66,10 +66,8 @@ xfs_trans_ichgtime( inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; - if (flags & XFS_ICHGTIME_CREATE) { - ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; - } + if (flags & XFS_ICHGTIME_CREATE) + ip->i_d.di_crtime = tv; } /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index d12bbd526e7c..d1a0848cb52e 100644 --- 
a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_has_v3inode(&mp->m_sb)) return res; size = XFS_FSB_TO_B(mp, 1); } @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. 
This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + 
xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 
4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* @@ -718,7 +776,7 @@ xfs_calc_clear_agi_bucket_reservation( /* * Adjusting quota limits. - * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + * the disk quota buffer: sizeof(struct xfs_disk_dquot) */ STATIC uint xfs_calc_qm_setqlim_reservation(void) @@ -742,7 +800,7 @@ xfs_calc_qm_dqalloc_reservation( /* * Turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 * the superblock for the quota flags: sector size */ STATIC uint @@ -755,7 +813,7 @@ xfs_calc_qm_quotaoff_reservation( /* * End of turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint xfs_calc_qm_quotaoff_end_reservation(void) diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 300b3e91ca3a..397d94775440 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -21,7 +21,6 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ -typedef int32_t xfs_tid_t; /* transaction identifier */ typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ @@ -33,7 +32,6 @@ typedef uint64_t xfs_fileoff_t; /* block number in a file */ typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ -typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* * New verifiers will return the instruction 
address of the failing check. diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ba0f747c82e8..e9bcf1faa183 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -92,7 +92,7 @@ xchk_superblock( if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; - sb = XFS_BUF_TO_SBP(bp); + sb = bp->b_addr; /* * Verify the geometries match. Fields that are permanently @@ -358,7 +358,7 @@ static inline void xchk_agf_xref_freeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_extlen_t blocks = 0; int error; @@ -378,7 +378,7 @@ static inline void xchk_agf_xref_cntbt( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t agbno; xfs_extlen_t blocks; int have; @@ -410,7 +410,7 @@ STATIC void xchk_agf_xref_btreeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t blocks; xfs_agblock_t btreeblks; @@ -456,7 +456,7 @@ static inline void xchk_agf_xref_refcblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t blocks; int error; @@ -525,7 +525,7 @@ xchk_agf( goto out; xchk_buffer_recheck(sc, sc->sa.agf_bp); - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -711,7 +711,7 @@ xchk_agfl( goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; agflcount = be32_to_cpu(agf->agf_flcount); if (agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -728,7 +728,7 @@ xchk_agfl( } /* Check the blocks in the AGFL. 
*/ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; @@ -765,7 +765,7 @@ static inline void xchk_agi_xref_icounts( struct xfs_scrub *sc) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; xfs_agino_t icount; xfs_agino_t freecount; int error; @@ -834,7 +834,7 @@ xchk_agi( goto out; xchk_buffer_recheck(sc, sc->sa.agi_bp); - agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + agi = sc->sa.agi_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 7a1a38b636a9..bca2ab1d4be9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -49,7 +49,7 @@ xrep_superblock( /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); @@ -140,7 +140,7 @@ xrep_agf_find_btrees( struct xrep_find_ag_btree *fab, struct xfs_buf *agfl_bp) { - struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *old_agf = agf_bp->b_addr; int error; /* Go find the root data. 
*/ @@ -181,7 +181,7 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); @@ -238,7 +238,7 @@ xrep_agf_calc_from_btrees( { struct xrep_agf_allocbt raa = { .sc = sc }; struct xfs_btree_cur *cur = NULL; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; xfs_agblock_t blocks; @@ -302,7 +302,7 @@ xrep_agf_commit_new( struct xfs_buf *agf_bp) { struct xfs_perag *pag; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; /* Trigger fdblocks recalculation */ xfs_force_summary_recalc(sc->mp); @@ -376,7 +376,7 @@ xrep_agf( if (error) return error; agf_bp->b_ops = &xfs_agf_buf_ops; - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; /* * Load the AGFL so that we can screen out OWN_AG blocks that are on @@ -395,7 +395,7 @@ xrep_agf( * Spot-check the AGFL blocks; if they're obviously corrupt then * there's nothing we can do but bail out. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp, + error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp, xrep_agf_check_agfl_block, sc); if (error) return error; @@ -429,10 +429,10 @@ out_revert: struct xrep_agfl { /* Bitmap of other OWN_AG metadata blocks. */ - struct xfs_bitmap agmetablocks; + struct xbitmap agmetablocks; /* Bitmap of free space. */ - struct xfs_bitmap *freesp; + struct xbitmap *freesp; struct xfs_scrub *sc; }; @@ -453,14 +453,14 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. 
*/ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, rec->rm_startblock); - error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) return error; } - return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur); + return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } /* @@ -476,19 +476,17 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; int error; ra.sc = sc; ra.freesp = agfl_extents; - xfs_bitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); @@ -500,7 +498,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_BNO); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; xfs_btree_del_cursor(cur, error); @@ -508,7 +506,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_CNT); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -518,8 +516,8 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. 
*/ - error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks); - xfs_bitmap_destroy(&ra.agmetablocks); + error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); if (error) return error; @@ -527,18 +525,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = 0; - for_each_xfs_bitmap_extent(br, n, agfl_extents) { - *flcount += br->len; - if (*flcount > xfs_agfl_size(mp)) - break; - } - if (*flcount > xfs_agfl_size(mp)) - *flcount = xfs_agfl_size(mp); + *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + xfs_agfl_size(mp)); return 0; err: - xfs_bitmap_destroy(&ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); xfs_btree_del_cursor(cur, error); return error; } @@ -550,7 +542,7 @@ xrep_agfl_update_agf( struct xfs_buf *agf_bp, xfs_agblock_t flcount) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; ASSERT(flcount <= xfs_agfl_size(sc->mp)); @@ -573,13 +565,13 @@ STATIC void xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t flcount) { struct xfs_mount *mp = sc->mp; __be32 *agfl_bno; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; + struct xbitmap_range *br; + struct xbitmap_range *n; struct xfs_agfl *agfl; xfs_agblock_t agbno; unsigned int fl_off; @@ -602,8 +594,8 @@ xrep_agfl_init_header( * step. 
*/ fl_off = 0; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp); - for_each_xfs_bitmap_extent(br, n, agfl_extents) { + agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); + for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); @@ -637,7 +629,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_bitmap agfl_extents; + struct xbitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -649,7 +641,7 @@ xrep_agfl( return -EOPNOTSUPP; xchk_perag_get(sc->mp, &sc->sa); - xfs_bitmap_init(&agfl_extents); + xbitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -659,8 +651,6 @@ xrep_agfl( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* * Make sure we have the AGFL buffer, as scrub might have decided it @@ -698,10 +688,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xfs_bitmap_destroy(&agfl_extents); + xbitmap_destroy(&agfl_extents); return error; } @@ -735,8 +725,6 @@ xrep_agi_find_btrees( error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; /* Find the btree roots. 
*/ error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL); @@ -765,7 +753,7 @@ xrep_agi_init_header( struct xfs_buf *agi_bp, struct xfs_agi *old_agi) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); @@ -811,7 +799,7 @@ xrep_agi_calc_from_btrees( struct xfs_buf *agi_bp) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agino_t count; xfs_agino_t freecount; @@ -839,7 +827,7 @@ xrep_agi_commit_new( struct xfs_buf *agi_bp) { struct xfs_perag *pag; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; /* Trigger inode count recalculation */ xfs_force_summary_recalc(sc->mp); @@ -896,7 +884,7 @@ xrep_agi( if (error) return error; agi_bp->b_ops = &xfs_agi_buf_ops; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; /* Find the AGI btree roots. */ error = xrep_agi_find_btrees(sc, fab); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 5533e48e605d..73d924e47565 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -94,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 0edc7f8eb96e..9faddb334a2c 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ struct xchk_xattr { /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) * to call this function for every attribute key in an inode. 
Once * we're here, we load the attribute value to see if any errors happen, * or if we get more or less data than we expected. @@ -147,11 +147,8 @@ xchk_xattr_listent( return; } - args.flags = ATTR_KERNOTIME; - if (flags & XFS_ATTR_ROOT) - args.flags |= ATTR_ROOT; - else if (flags & XFS_ATTR_SECURE) - args.flags |= ATTR_SECURE; + args.op_flags = XFS_DA_OP_NOTIME; + args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; args.geo = context->dp->i_mount->m_attr_geo; args.whichfork = XFS_ATTR_FORK; args.dp = context->dp; @@ -162,7 +159,10 @@ xchk_xattr_listent( args.value = xchk_xattr_valuebuf(sx->sc); args.valuelen = valuelen; - error = xfs_attr_get_ilocked(context->dp, &args); + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -398,15 +398,14 @@ out: STATIC int xchk_xattr_rec( struct xchk_da_btree *ds, - int level, - void *rec) + int level) { struct xfs_mount *mp = ds->state->mp; - struct xfs_attr_leaf_entry *ent = rec; - struct xfs_da_state_blk *blk; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; struct xfs_buf *bp; + struct xfs_attr_leaf_entry *ent; xfs_dahash_t calc_hash; xfs_dahash_t hash; int nameidx; @@ -414,7 +413,9 @@ xchk_xattr_rec( unsigned int badflags; int error; - blk = &ds->state->path.blk[level]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index; /* Check the whole block, if necessary. */ error = xchk_xattr_block(ds, level); @@ -473,7 +474,6 @@ xchk_xattr( struct xfs_scrub *sc) { struct xchk_xattr sx; - struct attrlist_cursor_kern cursor = { 0 }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -492,11 +492,10 @@ xchk_xattr( /* Check that every attr key can also be looked up by hash. 
*/ sx.context.dp = sc->ip; - sx.context.cursor = &cursor; sx.context.resynch = 1; sx.context.put_listent = xchk_xattr_listent; sx.context.tp = sc->tp; - sx.context.flags = ATTR_INCOMPLETE; + sx.context.allow_incomplete = true; sx.sc = sc; /* @@ -515,7 +514,7 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_int_ilocked(&sx.context); + error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 3d47d111be5a..f88694f22d05 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -18,14 +18,14 @@ * This is the logical equivalent of bitmap |= mask(start, len). */ int -xfs_bitmap_set( - struct xfs_bitmap *bitmap, +xbitmap_set( + struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xfs_bitmap_range *bmr; + struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL); + bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!bmr) return -ENOMEM; @@ -39,13 +39,13 @@ xfs_bitmap_set( /* Free everything related to this bitmap. */ void -xfs_bitmap_destroy( - struct xfs_bitmap *bitmap) +xbitmap_destroy( + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; - for_each_xfs_bitmap_extent(bmr, n, bitmap) { + for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); kmem_free(bmr); } @@ -53,24 +53,24 @@ xfs_bitmap_destroy( /* Set up a per-AG block bitmap. */ void -xfs_bitmap_init( - struct xfs_bitmap *bitmap) +xbitmap_init( + struct xbitmap *bitmap) { INIT_LIST_HEAD(&bitmap->list); } /* Compare two btree extents. 
*/ static int -xfs_bitmap_range_cmp( +xbitmap_range_cmp( void *priv, struct list_head *a, struct list_head *b) { - struct xfs_bitmap_range *ap; - struct xfs_bitmap_range *bp; + struct xbitmap_range *ap; + struct xbitmap_range *bp; - ap = container_of(a, struct xfs_bitmap_range, list); - bp = container_of(b, struct xfs_bitmap_range, list); + ap = container_of(a, struct xbitmap_range, list); + bp = container_of(b, struct xbitmap_range, list); if (ap->start > bp->start) return 1; @@ -96,14 +96,14 @@ xfs_bitmap_range_cmp( #define LEFT_ALIGNED (1 << 0) #define RIGHT_ALIGNED (1 << 1) int -xfs_bitmap_disunion( - struct xfs_bitmap *bitmap, - struct xfs_bitmap *sub) +xbitmap_disunion( + struct xbitmap *bitmap, + struct xbitmap *sub) { struct list_head *lp; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *new_br; - struct xfs_bitmap_range *sub_br; + struct xbitmap_range *br; + struct xbitmap_range *new_br; + struct xbitmap_range *sub_br; uint64_t sub_start; uint64_t sub_len; int state; @@ -113,8 +113,8 @@ xfs_bitmap_disunion( return 0; ASSERT(!list_empty(&sub->list)); - list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp); - list_sort(NULL, &sub->list, xfs_bitmap_range_cmp); + list_sort(NULL, &bitmap->list, xbitmap_range_cmp); + list_sort(NULL, &sub->list, xbitmap_range_cmp); /* * Now that we've sorted both lists, we iterate bitmap once, rolling @@ -124,11 +124,11 @@ xfs_bitmap_disunion( * list traversal is similar to merge sort, but we're deleting * instead. In this manner we avoid O(n^2) operations. 
*/ - sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range, + sub_br = list_first_entry(&sub->list, struct xbitmap_range, list); lp = bitmap->list.next; while (lp != &bitmap->list) { - br = list_entry(lp, struct xfs_bitmap_range, list); + br = list_entry(lp, struct xbitmap_range, list); /* * Advance sub_br and/or br until we find a pair that @@ -181,7 +181,7 @@ xfs_bitmap_disunion( * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xfs_bitmap_range), + new_br = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!new_br) { error = -ENOMEM; @@ -247,8 +247,8 @@ out: * blocks going from the leaf towards the root. */ int -xfs_bitmap_set_btcur_path( - struct xfs_bitmap *bitmap, +xbitmap_set_btcur_path( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { struct xfs_buf *bp; @@ -261,7 +261,7 @@ xfs_bitmap_set_btcur_path( if (!bp) continue; fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - error = xfs_bitmap_set(bitmap, fsb, 1); + error = xbitmap_set(bitmap, fsb, 1); if (error) return error; } @@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path( /* Collect a btree's block in the bitmap. */ STATIC int -xfs_bitmap_collect_btblock( +xbitmap_collect_btblock( struct xfs_btree_cur *cur, int level, void *priv) { - struct xfs_bitmap *bitmap = priv; + struct xbitmap *bitmap = priv; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -285,14 +285,30 @@ xfs_bitmap_collect_btblock( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - return xfs_bitmap_set(bitmap, fsbno, 1); + return xbitmap_set(bitmap, fsbno, 1); } /* Walk the btree and mark the bitmap wherever a btree block is found. 
*/ int -xfs_bitmap_set_btblocks( - struct xfs_bitmap *bitmap, +xbitmap_set_btblocks( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { - return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, bitmap); + return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap_hweight( + struct xbitmap *bitmap) +{ + struct xbitmap_range *bmr; + struct xbitmap_range *n; + uint64_t ret = 0; + + for_each_xbitmap_extent(bmr, n, bitmap) + ret += bmr->len; + + return ret; } diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index ae8ecbce6fa6..900646b72de1 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,31 +6,32 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xfs_bitmap_range { +struct xbitmap_range { struct list_head list; uint64_t start; uint64_t len; }; -struct xfs_bitmap { +struct xbitmap { struct list_head list; }; -void xfs_bitmap_init(struct xfs_bitmap *bitmap); -void xfs_bitmap_destroy(struct xfs_bitmap *bitmap); +void xbitmap_init(struct xbitmap *bitmap); +void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xfs_bitmap_extent(bex, n, bitmap) \ +#define for_each_xbitmap_extent(bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) -#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \ +#define for_each_xbitmap_block(b, bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = bex->start; (b) < bex->start + bex->len; (b)++) + for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) -int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len); -int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub); -int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap, +int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); +int 
xbitmap_set_btcur_path(struct xbitmap *bitmap, struct xfs_btree_cur *cur); -int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap, +int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); +uint64_t xbitmap_hweight(struct xbitmap *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa6ea6407992..add8598eacd5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -374,7 +374,7 @@ xchk_bmapbt_rec( struct xfs_bmbt_irec iext_irec; struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; - struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); @@ -501,7 +501,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_private.a.agno, rec->rm_startblock)) + cur->bc_ag.agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 003a772cd26c..2e50d146105d 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -14,8 +14,15 @@ static inline bool xchk_should_terminate( struct xfs_scrub *sc, - int *error) + int *error) { + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + if (fatal_signal_pending(current)) { if (*error == 0) *error = -EAGAIN; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 77ff9f97bcda..9a2e27ac1300 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -77,40 +77,18 @@ xchk_da_set_corrupt( __return_address); } -/* Find an entry at a certain level in a da btree. 
*/ -STATIC void * -xchk_da_btree_entry( - struct xchk_da_btree *ds, - int level, - int rec) +static struct xfs_da_node_entry * +xchk_da_btree_node_entry( + struct xchk_da_btree *ds, + int level) { - char *ents; - struct xfs_da_state_blk *blk; - void *baddr; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_da3_icnode_hdr hdr; - /* Dispatch the entry finding function. */ - blk = &ds->state->path.blk[level]; - baddr = blk->bp->b_addr; - switch (blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - ents = (char *)xfs_attr3_leaf_entryp(baddr); - return ents + (rec * sizeof(struct xfs_attr_leaf_entry)); - case XFS_DIR2_LEAFN_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); - return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); - case XFS_DIR2_LEAF1_MAGIC: - case XFS_DIR3_LEAF1_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); - return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr); - return ents + (rec * sizeof(struct xfs_da_node_entry)); - } + ASSERT(blk->magic == XFS_DA_NODE_MAGIC); - return NULL; + xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr); + return hdr.btree + blk->index; } /* Scrub a da btree hash (key). */ @@ -120,7 +98,6 @@ xchk_da_btree_hash( int level, __be32 *hashp) { - struct xfs_da_state_blk *blks; struct xfs_da_node_entry *entry; xfs_dahash_t hash; xfs_dahash_t parent_hash; @@ -135,8 +112,7 @@ xchk_da_btree_hash( return 0; /* Is this hash no larger than the parent hash? 
*/ - blks = ds->state->path.blk; - entry = xchk_da_btree_entry(ds, level - 1, blks[level - 1].index); + entry = xchk_da_btree_node_entry(ds, level - 1); parent_hash = be32_to_cpu(entry->hashval); if (parent_hash < hash) xchk_da_set_corrupt(ds, level); @@ -243,19 +219,21 @@ xchk_da_btree_block_check_sibling( int direction, xfs_dablk_t sibling) { + struct xfs_da_state_path *path = &ds->state->path; + struct xfs_da_state_path *altpath = &ds->state->altpath; int retval; + int plevel; int error; - memcpy(&ds->state->altpath, &ds->state->path, - sizeof(ds->state->altpath)); + memcpy(altpath, path, sizeof(ds->state->altpath)); /* * If the pointer is null, we shouldn't be able to move the upper * level pointer anywhere. */ if (sibling == 0) { - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, + false, &retval); if (error == 0 && retval == 0) xchk_da_set_corrupt(ds, level); error = 0; @@ -263,27 +241,33 @@ xchk_da_btree_block_check_sibling( } /* Move the alternate cursor one block in the direction given. */ - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, false, + &retval); if (!xchk_da_process_error(ds, level, &error)) - return error; + goto out; if (retval) { xchk_da_set_corrupt(ds, level); - return error; + goto out; } - if (ds->state->altpath.blk[level].bp) - xchk_buffer_recheck(ds->sc, - ds->state->altpath.blk[level].bp); + if (altpath->blk[level].bp) + xchk_buffer_recheck(ds->sc, altpath->blk[level].bp); /* Compare upper level pointer to sibling pointer. 
*/ - if (ds->state->altpath.blk[level].blkno != sibling) + if (altpath->blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - if (ds->state->altpath.blk[level].bp) { - xfs_trans_brelse(ds->dargs.trans, - ds->state->altpath.blk[level].bp); - ds->state->altpath.blk[level].bp = NULL; - } + out: + /* Free all buffers in the altpath that aren't referenced from path. */ + for (plevel = 0; plevel < altpath->active; plevel++) { + if (altpath->blk[plevel].bp == NULL || + (plevel < path->active && + altpath->blk[plevel].bp == path->blk[plevel].bp)) + continue; + + xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp); + altpath->blk[plevel].bp = NULL; + } + return error; } @@ -355,8 +339,8 @@ xchk_da_btree_block( goto out_nobuf; /* Read the buffer. */ - error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2, - &blk->bp, dargs->whichfork, + error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, + XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork, &xchk_da_btree_buf_ops); if (!xchk_da_process_error(ds, level, &error)) goto out_nobuf; @@ -433,8 +417,8 @@ xchk_da_btree_block( XFS_BLFT_DA_NODE_BUF); blk->magic = XFS_DA_NODE_MAGIC; node = blk->bp->b_addr; - ip->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = ip->d_ops->node_tree_p(node); + xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node); + btree = nodehdr.btree; *pmaxrecs = nodehdr.count; blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval); if (level == 0) { @@ -479,14 +463,12 @@ xchk_da_btree( struct xfs_mount *mp = sc->mp; struct xfs_da_state_blk *blks; struct xfs_da_node_entry *key; - void *rec; xfs_dablk_t blkno; int level; int error; /* Skip short format data structures; no btree to scan. */ - if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE) + if (!xfs_ifork_has_extents(sc->ip, whichfork)) return 0; /* Set up initial da state. */ @@ -538,9 +520,7 @@ xchk_da_btree( } /* Dispatch record scrubbing. 
*/ - rec = xchk_da_btree_entry(&ds, level, - blks[level].index); - error = scrub_fn(&ds, level, rec); + error = scrub_fn(&ds, level); if (error) break; if (xchk_should_terminate(sc, &error) || @@ -562,7 +542,7 @@ xchk_da_btree( } /* Hashes in order for scrub? */ - key = xchk_da_btree_entry(&ds, level, blks[level].index); + key = xchk_da_btree_node_entry(&ds, level); error = xchk_da_btree_hash(&ds, level, &key->hashval); if (error) goto out; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index cb3f0003245b..1f3515c6d5a8 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -28,8 +28,7 @@ struct xchk_da_btree { int tree_level; }; -typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, - int level, void *rec); +typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level); /* Check for da btree operation errors. */ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 1e2e11721eb9..fe2a6e030c8a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -113,6 +113,9 @@ xchk_dir_actor( offset = xfs_dir2_db_to_da(mp->m_dir_geo, xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + if (xchk_should_terminate(sdc->sc, &error)) + return error; + /* Does this inode number make sense? 
*/ if (!xfs_verify_dir_ino(mp, ino)) { xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); @@ -152,6 +155,9 @@ xchk_dir_actor( xname.type = XFS_DIR3_FT_UNKNOWN; error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -179,15 +185,17 @@ out: STATIC int xchk_dir_rec( struct xchk_da_btree *ds, - int level, - void *rec) + int level) { + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_mount *mp = ds->state->mp; - struct xfs_dir2_leaf_entry *ent = rec; struct xfs_inode *dp = ds->dargs.dp; + struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_data_entry *dent; struct xfs_buf *bp; - char *p, *endp; + struct xfs_dir2_leaf_entry *ent; + unsigned int end; + unsigned int iter_off; xfs_ino_t ino; xfs_dablk_t rec_bno; xfs_dir2_db_t db; @@ -195,9 +203,16 @@ xchk_dir_rec( xfs_dir2_dataptr_t ptr; xfs_dahash_t calc_hash; xfs_dahash_t hash; + struct xfs_dir3_icleaf_hdr hdr; unsigned int tag; int error; + ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr); + ent = hdr.ents + blk->index; + /* Check the hash of the entry. */ error = xchk_da_btree_hash(ds, level, &ent->hashval); if (error) @@ -209,15 +224,16 @@ xchk_dir_rec( return 0; /* Find the directory entry's location. 
*/ - db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr); - off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr); - rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db); + db = xfs_dir2_dataptr_to_db(geo, ptr); + off = xfs_dir2_dataptr_to_off(geo, ptr); + rec_bno = xfs_dir2_db_to_da(geo, db); - if (rec_bno >= mp->m_dir_geo->leafblk) { + if (rec_bno >= geo->leafblk) { xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, + XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -230,38 +246,37 @@ xchk_dir_rec( if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out_relse; - dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + dent = bp->b_addr + off; /* Make sure we got a real directory entry. */ - p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr); - endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); - if (!endp) { + iter_off = geo->data_entry_offset; + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + if (!end) { xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out_relse; } - while (p < endp) { - struct xfs_dir2_data_entry *dep; - struct xfs_dir2_data_unused *dup; + for (;;) { + struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off; + struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off; + + if (iter_off >= end) { + xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } - dup = (struct xfs_dir2_data_unused *)p; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - p += be16_to_cpu(dup->length); + iter_off += be16_to_cpu(dup->length); continue; } - dep = (struct xfs_dir2_data_entry *)p; if (dep == dent) break; - p += mp->m_dir_inode_ops->data_entsize(dep->namelen); - } - if (p >= endp) { - xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); - goto out_relse; + iter_off += xfs_dir2_data_entsize(mp, dep->namelen); } /* 
Retrieve the entry, sanity check it, and compare hashes. */ ino = be64_to_cpu(dent->inumber); hash = be32_to_cpu(ent->hashval); - tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); + tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent)); if (!xfs_verify_dir_ino(mp, ino) || tag != off) xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); if (dent->namelen == 0) { @@ -319,19 +334,15 @@ xchk_directory_data_bestfree( struct xfs_buf *bp; struct xfs_dir2_data_free *bf; struct xfs_mount *mp = sc->mp; - const struct xfs_dir_ops *d_ops; - char *ptr; - char *endptr; u16 tag; unsigned int nr_bestfrees = 0; unsigned int nr_frees = 0; unsigned int smallest_bestfree; int newlen; - int offset; + unsigned int offset; + unsigned int end; int error; - d_ops = sc->ip->d_ops; - if (is_block) { /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) @@ -339,7 +350,7 @@ xchk_directory_data_bestfree( error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); } else { /* dir data format */ - error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -351,7 +362,7 @@ xchk_directory_data_bestfree( goto out_buf; /* Do the bestfrees correspond to actual free space? */ - bf = d_ops->data_bestfree_p(bp->b_addr); + bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr); smallest_bestfree = UINT_MAX; for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { offset = be16_to_cpu(dfp->offset); @@ -361,13 +372,13 @@ xchk_directory_data_bestfree( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset); + dup = bp->b_addr + offset; tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); /* bestfree doesn't match the entry it points at? 
*/ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) || be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) || - tag != ((char *)dup - (char *)bp->b_addr)) { + tag != offset) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } @@ -383,30 +394,30 @@ xchk_directory_data_bestfree( } /* Make sure the bestfrees are actually the best free spaces. */ - ptr = (char *)d_ops->data_entry_p(bp->b_addr); - endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr); + offset = mp->m_dir_geo->data_entry_offset; + end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr); /* Iterate the entries, stopping when we hit or go past the end. */ - while (ptr < endptr) { - dup = (struct xfs_dir2_data_unused *)ptr; + while (offset < end) { + dup = bp->b_addr + offset; + /* Skip real entries */ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) { - struct xfs_dir2_data_entry *dep; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; - dep = (struct xfs_dir2_data_entry *)ptr; - newlen = d_ops->data_entsize(dep->namelen); + newlen = xfs_dir2_data_entsize(mp, dep->namelen); if (newlen <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - ptr += newlen; + offset += newlen; continue; } /* Spot check this free entry */ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); - if (tag != ((char *)dup - (char *)bp->b_addr)) { + if (tag != offset) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } @@ -425,13 +436,13 @@ xchk_directory_data_bestfree( xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out_buf; } - ptr += newlen; - if (ptr <= endptr) + offset += newlen; + if (offset <= end) nr_frees++; } /* We're required to fill all the space. */ - if (ptr != endptr) + if (offset != end) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); /* Did we see at least as many free slots as there are bestfrees? 
*/ @@ -458,7 +469,7 @@ xchk_directory_check_freesp( { struct xfs_dir2_data_free *dfp; - dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr); + dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr); if (len != be16_to_cpu(dfp->length)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); @@ -475,12 +486,10 @@ xchk_directory_leaf1_bestfree( xfs_dablk_t lblk) { struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; struct xfs_dir2_leaf_tail *ltp; struct xfs_dir2_leaf *leaf; struct xfs_buf *dbp; struct xfs_buf *bp; - const struct xfs_dir_ops *d_ops = sc->ip->d_ops; struct xfs_da_geometry *geo = sc->mp->m_dir_geo; __be16 *bestp; __u16 best; @@ -492,14 +501,13 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. */ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); leaf = bp->b_addr; - d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - ents = d_ops->leaf_ents_p(leaf); + xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf); ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestcount = be32_to_cpu(ltp->bestcount); bestp = xfs_dir2_leaf_bests_p(ltp); @@ -521,24 +529,25 @@ xchk_directory_leaf1_bestfree( } /* Is the leaf count even remotely sane? */ - if (leafhdr.count > d_ops->leaf_max_ents(geo)) { + if (leafhdr.count > geo->leaf_max_ents) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } /* Leaves and bests don't overlap in leaf format. */ - if ((char *)&ents[leafhdr.count] > (char *)bestp) { + if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); goto out; } /* Check hash value order, count stale entries. 
*/ for (i = 0; i < leafhdr.count; i++) { - hash = be32_to_cpu(ents[i].hashval); + hash = be32_to_cpu(leafhdr.ents[i].hashval); if (i > 0 && lasthash > hash) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); lasthash = hash; - if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (leafhdr.ents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } if (leafhdr.stale != stale) @@ -552,16 +561,17 @@ xchk_directory_leaf1_bestfree( if (best == NULLDATAOFF) continue; error = xfs_dir3_data_read(sc->tp, sc->ip, - i * args->geo->fsbcount, -1, &dbp); + i * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + break; } out: + xfs_trans_brelse(sc->tp, bp); return error; } @@ -575,7 +585,6 @@ xchk_directory_free_bestfree( struct xfs_dir3_icfree_hdr freehdr; struct xfs_buf *dbp; struct xfs_buf *bp; - __be16 *bestp; __u16 best; unsigned int stale = 0; int i; @@ -584,7 +593,7 @@ xchk_directory_free_bestfree( /* Read the free space block */ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { @@ -595,20 +604,19 @@ xchk_directory_free_bestfree( } /* Check all the entries. 
*/ - sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr); - bestp = sc->ip->d_ops->free_bests_p(bp->b_addr); - for (i = 0; i < freehdr.nvalid; i++, bestp++) { - best = be16_to_cpu(*bestp); + xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr); + for (i = 0; i < freehdr.nvalid; i++) { + best = be16_to_cpu(freehdr.bests[i]); if (best == NULLDATAOFF) { stale++; continue; } error = xfs_dir3_data_read(sc->tp, sc->ip, (freehdr.firstdb + i) * args->geo->fsbcount, - -1, &dbp); + 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - break; + goto out; xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -616,6 +624,7 @@ xchk_directory_free_bestfree( if (freehdr.nused + stale != freehdr.nvalid) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); out: + xfs_trans_brelse(sc->tp, bp); return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 98f82d7c8b40..ec2064ed3c30 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -83,9 +83,6 @@ xchk_fscount_warmup( error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); if (error) break; - error = -ENOMEM; - if (!agf_bp || !agi_bp) - break; /* * These are supposed to be initialized by the header read @@ -104,7 +101,7 @@ next_loop_perag: pag = NULL; error = 0; - if (fatal_signal_pending(current)) + if (xchk_should_terminate(sc, &error)) break; } @@ -163,6 +160,7 @@ xchk_fscount_aggregate_agcounts( uint64_t delayed; xfs_agnumber_t agno; int tries = 8; + int error = 0; retry: fsc->icount = 0; @@ -196,10 +194,13 @@ retry: xfs_perag_put(pag); - if (fatal_signal_pending(current)) + if (xchk_should_terminate(sc, &error)) break; } + if (error) + return error; + /* * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. 
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index b2f602811e9d..83d27cdf579b 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -11,6 +11,7 @@ #include "xfs_sb.h" #include "xfs_health.h" #include "scrub/scrub.h" +#include "scrub/health.h" /* * Scrub and In-Core Filesystem Health Assessments diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 681758704fda..64c217eb06a7 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -164,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. */ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -215,7 +215,7 @@ xchk_iallocbt_check_cluster( struct xfs_dinode *dip; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -426,7 +426,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index c962bd534690..5705adc43a75 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -32,8 +32,10 @@ xchk_setup_parent( struct xchk_parent_ctx { struct dir_context dc; + struct xfs_scrub *sc; xfs_ino_t ino; xfs_nlink_t nlink; + 
bool cancelled; }; /* Look for a single entry in a directory pointing to an inode. */ @@ -47,11 +49,21 @@ xchk_parent_actor( unsigned type) { struct xchk_parent_ctx *spc; + int error = 0; spc = container_of(dc, struct xchk_parent_ctx, dc); if (spc->ino == ino) spc->nlink++; - return 0; + + /* + * If we're facing a fatal signal, bail out. Store the cancellation + * status separately because the VFS readdir code squashes error codes + * into short directory reads. + */ + if (xchk_should_terminate(spc->sc, &error)) + spc->cancelled = true; + + return error; } /* Count the number of dentries in the parent dir that point to this inode. */ @@ -62,10 +74,9 @@ xchk_parent_count_parent_dentries( xfs_nlink_t *nlink) { struct xchk_parent_ctx spc = { - .dc.actor = xchk_parent_actor, - .dc.pos = 0, - .ino = sc->ip->i_ino, - .nlink = 0, + .dc.actor = xchk_parent_actor, + .ino = sc->ip->i_ino, + .sc = sc, }; size_t bufsize; loff_t oldpos; @@ -80,7 +91,7 @@ xchk_parent_count_parent_dentries( */ lock_mode = xfs_ilock_data_map_shared(parent); if (parent->i_d.di_nextents > 0) - error = xfs_dir3_data_readahead(parent, 0, -1); + error = xfs_dir3_data_readahead(parent, 0, 0); xfs_iunlock(parent, lock_mode); if (error) return error; @@ -97,6 +108,10 @@ xchk_parent_count_parent_dentries( error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); if (error) goto out; + if (spc.cancelled) { + error = -EAGAIN; + goto out; + } if (oldpos == spc.dc.pos) break; oldpos = spc.dc.pos; diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 0a33b4421c32..905a34558361 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -93,6 +93,10 @@ xchk_quota_item( unsigned long long rcount; xfs_ino_t fs_icount; xfs_dqid_t id = be32_to_cpu(d->d_id); + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; /* * Except for the root dquot, the actual dquot we got must either have @@ -178,6 +182,9 @@ xchk_quota_item( if (id != 0 && rhard != 0 && rcount > rhard) 
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -EFSCORRUPTED; + return 0; } diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 0cab11a5d390..beaeb6fa3119 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -336,7 +336,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index b70a88bc975e..db3cfd12803d 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -208,8 +208,10 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); if (!error) { - aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); - freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks); + struct xfs_agf *agf = bp->b_addr; + + aglen = be32_to_cpu(agf->agf_length); + freelen = be32_to_cpu(agf->agf_freeblks); usedlen = aglen - freelen; xfs_buf_relse(bp); } @@ -341,13 +343,17 @@ xrep_init_btblock( struct xfs_trans *tp = sc->tp; struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; + int error; trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), btnum); ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), - XFS_FSB_TO_BB(mp, 1), 0); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, + &bp); + if (error) + return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); @@ -430,10 +436,10 @@ xrep_init_btblock( int xrep_invalidate_blocks( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap) + struct xbitmap *bitmap) { 
- struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -445,7 +451,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; @@ -542,8 +548,6 @@ xrep_reap_block( error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); if (error) return error; - if (!agf_bp) - return -ENOMEM; } else { agf_bp = sc->sa.agf_bp; } @@ -593,18 +597,18 @@ out_free: int xrep_reap_extents( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, + struct xbitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; xfs_fsblock_t fsbno; int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); trace_xrep_dispose_btree_extent(sc->mp, @@ -613,11 +617,9 @@ xrep_reap_extents( error = xrep_reap_block(sc, fsbno, oinfo, type); if (error) - goto out; + break; } -out: - xfs_bitmap_destroy(bitmap); return error; } @@ -877,7 +879,7 @@ xrep_find_ag_btree_roots( ri.sc = sc; ri.btree_info = btree_info; - ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agf = agf_bp->b_addr; ri.agfl_bp = agfl_bp; for (fab = btree_info; fab->buf_ops; fab++) { ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60c61d7052a8..04a47d45605b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,11 +28,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, 
struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); -struct xfs_bitmap; +struct xbitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, +int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); +int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { @@ -75,7 +75,6 @@ static inline xfs_extlen_t xrep_calc_ag_resblks( struct xfs_scrub *sc) { - ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); return 0; } diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8d4cefd761c1..f4fcb4719f41 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 15c8c5f3f688..8ebf35b115ce 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -16,6 +16,7 @@ #include "xfs_qm.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -167,6 +168,7 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } + sb_end_write(sc->mp->m_super); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -489,6 +491,14 @@ xfs_scrub_metadata( sc.ops = &meta_scrub_ops[sm->sm_type]; sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: + /* + * If freeze runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the filesystem and iterate over metadata + * buffers. 
Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while checking is running. + */ + sb_start_write(mp->m_super); + /* Set up for the operation. */ error = sc.ops->setup(&sc, ip); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 9eaab2eb5ed3..2c6c248be823 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -24,9 +24,9 @@ xchk_btree_cur_fsbno( return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); else if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) - return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3362bae28b46..e46f5cef90da 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -329,7 +329,7 @@ TRACE_EVENT(xchk_btree_op_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(int, error) __field(void *, ret_ip) ), @@ -379,7 +379,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; @@ -414,7 +414,7 @@ TRACE_EVENT(xchk_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + __field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( @@ -452,14 +452,14 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) - __field(int, ptr); + 
__field(int, ptr) __field(void *, ret_ip) ), TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 96d7071cfa46..d4c687b5cd06 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -12,8 +12,12 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_trace.h" -#include <linux/posix_acl_xattr.h> +#include "xfs_error.h" +#include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include <linux/posix_acl_xattr.h> /* * Locking scheme: @@ -23,6 +27,7 @@ STATIC struct posix_acl * xfs_acl_from_disk( + struct xfs_mount *mp, const struct xfs_acl *aclp, int len, int max_entries) @@ -32,11 +37,18 @@ xfs_acl_from_disk( const struct xfs_acl_entry *ace; unsigned int count, i; - if (len < sizeof(*aclp)) + if (len < sizeof(*aclp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } + count = be32_to_cpu(aclp->acl_cnt); - if (count > max_entries || XFS_ACL_SIZE(count) != len) + if (count > max_entries || XFS_ACL_SIZE(count) != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) @@ -57,10 +69,12 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + acl_e->e_uid = make_kuid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: - acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + acl_e->e_gid = make_kgid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: @@ -93,10 +107,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) ace->ae_tag 
= cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: - ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + ace->ae_id = cpu_to_be32( + from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: - ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + ace->ae_id = cpu_to_be32( + from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); @@ -110,99 +126,86 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) struct posix_acl * xfs_get_acl(struct inode *inode, int type) { - struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl = NULL; - unsigned char *ea_name; - int error; - int len; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct posix_acl *acl = NULL; + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + .valuelen = XFS_ACL_MAX_SIZE(mp), + }; + int error; trace_xfs_get_acl(ip); switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: BUG(); } + args.namelen = strlen(args.name); /* - * If we have a cached ACLs value just return it, not need to - * go out to the disk. + * If the attribute doesn't exist make sure we have a negative cache + * entry, for any other error assume it is transient. */ - len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, - ATTR_ALLOC | ATTR_ROOT); - if (error) { - /* - * If the attribute doesn't exist make sure we have a negative - * cache entry, for any other error assume it is transient. 
- */ - if (error != -ENOATTR) - acl = ERR_PTR(error); - } else { - acl = xfs_acl_from_disk(xfs_acl, len, - XFS_ACL_MAX_ENTRIES(ip->i_mount)); - kmem_free(xfs_acl); + error = xfs_attr_get(&args); + if (!error) { + acl = xfs_acl_from_disk(mp, args.value, args.valuelen, + XFS_ACL_MAX_ENTRIES(mp)); + } else if (error != -ENOATTR) { + acl = ERR_PTR(error); } + + kmem_free(args.value); return acl; } int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct xfs_inode *ip = XFS_I(inode); - unsigned char *ea_name; - int error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + }; + int error; switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: return -EINVAL; } + args.namelen = strlen(args.name); if (acl) { - struct xfs_acl *xfs_acl; - int len = XFS_ACL_MAX_SIZE(ip->i_mount); - - xfs_acl = kmem_zalloc_large(len, 0); - if (!xfs_acl) + args.valuelen = XFS_ACL_SIZE(acl->a_count); + args.value = kmem_zalloc_large(args.valuelen, 0); + if (!args.value) return -ENOMEM; - - xfs_acl_to_disk(xfs_acl, acl); - - /* subtract away the unused acl entries */ - len -= sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - - error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, - len, ATTR_ROOT); - - kmem_free(xfs_acl); - } else { - /* - * A NULL ACL argument means we want to remove the ACL. - */ - error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (error == -ENOATTR) - error = 0; + xfs_acl_to_disk(args.value, acl); } + error = xfs_attr_set(&args); + kmem_free(args.value); + + /* + * If the attribute didn't exist to start with that's fine. 
+ */ + if (!acl && error == -ENOATTR) + error = 0; if (!error) set_cached_acl(inode, type, acl); return error; @@ -262,3 +265,19 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } + +/* + * Invalidate any cached ACLs if the user has bypassed the ACL interface. + * We don't validate the content whatsoever so it is caller responsibility to + * provide data in valid format and ensure i_mode is consistent. + */ +void +xfs_forget_acl( + struct inode *inode, + const char *name) +{ + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +} diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 94615e34bc86..c042c0868016 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -13,14 +13,16 @@ struct posix_acl; extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +void xfs_forget_acl(struct inode *inode, const char *name); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL +static inline void xfs_forget_acl(struct inode *inode, const char *name) +{ +} #endif /* CONFIG_XFS_POSIX_ACL */ -extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); - #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..1fd4fb7a607c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -18,108 +18,22 @@ #include "xfs_bmap_util.h" #include "xfs_reflink.h" -/* - * structure owned by writepages passed to individual writepage calls - */ struct xfs_writepage_ctx { - struct xfs_bmbt_irec imap; - int fork; + struct iomap_writepage_ctx ctx; unsigned int data_seq; unsigned int cow_seq; - struct xfs_ioend *ioend; }; -struct block_device * 
-xfs_find_bdev_for_inode( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - - if (XFS_IS_REALTIME_INODE(ip)) - return mp->m_rtdev_targp->bt_bdev; - else - return mp->m_ddev_targp->bt_bdev; -} - -struct dax_device * -xfs_find_daxdev_for_inode( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - - if (XFS_IS_REALTIME_INODE(ip)) - return mp->m_rtdev_targp->bt_daxdev; - else - return mp->m_ddev_targp->bt_daxdev; -} - -static void -xfs_finish_page_writeback( - struct inode *inode, - struct bio_vec *bvec, - int error) -{ - struct iomap_page *iop = to_iomap_page(bvec->bv_page); - - if (error) { - SetPageError(bvec->bv_page); - mapping_set_error(inode->i_mapping, -EIO); - } - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) > 0); - - if (!iop || atomic_dec_and_test(&iop->write_count)) - end_page_writeback(bvec->bv_page); -} - -/* - * We're now finished for good with this ioend structure. Update the page - * state, release holds on bios, and finally free up memory. Do not use the - * ioend after this. - */ -STATIC void -xfs_destroy_ioend( - struct xfs_ioend *ioend, - int error) +static inline struct xfs_writepage_ctx * +XFS_WPC(struct iomap_writepage_ctx *ctx) { - struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - bool quiet = bio_flagged(bio, BIO_QUIET); - - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. 
- */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, iter_all) - xfs_finish_page_writeback(inode, bvec, error); - bio_put(bio); - } - - if (unlikely(error && !quiet)) { - xfs_err_ratelimited(XFS_I(inode)->i_mount, - "writeback error on sector %llu", start); - } + return container_of(ctx, struct xfs_writepage_ctx, ctx); } /* * Fast and loose check if this write could update the on-disk inode size. */ -static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) { return ioend->io_offset + ioend->io_size > XFS_I(ioend->io_inode)->i_d.di_size; @@ -127,7 +41,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) STATIC int xfs_setfilesize_trans_alloc( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; struct xfs_trans *tp; @@ -137,7 +51,7 @@ xfs_setfilesize_trans_alloc( if (error) return error; - ioend->io_append_trans = tp; + ioend->io_private = tp; /* * We may pass freeze protection with a transaction. 
So tell lockdep @@ -200,11 +114,11 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend, + struct iomap_ioend *ioend, int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_append_trans; + struct xfs_trans *tp = ioend->io_private; /* * The transaction may have been allocated in the I/O submission thread, @@ -228,9 +142,8 @@ xfs_setfilesize_ioend( */ STATIC void xfs_end_ioend( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { - struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -257,7 +170,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) xfs_reflink_cancel_cow_range(ip, offset, size, true); goto done; } @@ -265,154 +178,86 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_state == XFS_EXT_UNWRITTEN) + else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private); done: - if (ioend->io_append_trans) + if (ioend->io_private) error = xfs_setfilesize_ioend(ioend, error); - list_replace_init(&ioend->io_list, &ioend_list); - xfs_destroy_ioend(ioend, error); - - while (!list_empty(&ioend_list)) { - ioend = list_first_entry(&ioend_list, struct xfs_ioend, - io_list); - list_del_init(&ioend->io_list); - xfs_destroy_ioend(ioend, error); - } - + iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } /* - * We can merge two adjacent ioends if they have the same set of work to do. 
- */ -static bool -xfs_ioend_can_merge( - struct xfs_ioend *ioend, - struct xfs_ioend *next) -{ - if (ioend->io_bio->bi_status != next->io_bio->bi_status) - return false; - if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) - return false; - if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ - (next->io_state == XFS_EXT_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - return true; -} - -/* * If the to be merged ioend has a preallocated transaction for file * size updates we need to ensure the ioend it is merged into also * has one. If it already has one we can simply cancel the transaction * as it is guaranteed to be clean. */ static void -xfs_ioend_merge_append_transactions( - struct xfs_ioend *ioend, - struct xfs_ioend *next) +xfs_ioend_merge_private( + struct iomap_ioend *ioend, + struct iomap_ioend *next) { - if (!ioend->io_append_trans) { - ioend->io_append_trans = next->io_append_trans; - next->io_append_trans = NULL; + if (!ioend->io_private) { + ioend->io_private = next->io_private; + next->io_private = NULL; } else { xfs_setfilesize_ioend(next, -ECANCELED); } } -/* Try to merge adjacent completions. 
*/ -STATIC void -xfs_ioend_try_merge( - struct xfs_ioend *ioend, - struct list_head *more_ioends) -{ - struct xfs_ioend *next_ioend; - - while (!list_empty(more_ioends)) { - next_ioend = list_first_entry(more_ioends, struct xfs_ioend, - io_list); - if (!xfs_ioend_can_merge(ioend, next_ioend)) - break; - list_move_tail(&next_ioend->io_list, &ioend->io_list); - ioend->io_size += next_ioend->io_size; - if (next_ioend->io_append_trans) - xfs_ioend_merge_append_transactions(ioend, next_ioend); - } -} - -/* list_sort compare function for ioends */ -static int -xfs_ioend_compare( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_ioend *ia; - struct xfs_ioend *ib; - - ia = container_of(a, struct xfs_ioend, io_list); - ib = container_of(b, struct xfs_ioend, io_list); - if (ia->io_offset < ib->io_offset) - return -1; - else if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - /* Finish all pending io completions. */ void xfs_end_io( struct work_struct *work) { - struct xfs_inode *ip; - struct xfs_ioend *ioend; - struct list_head completion_list; + struct xfs_inode *ip = + container_of(work, struct xfs_inode, i_ioend_work); + struct iomap_ioend *ioend; + struct list_head tmp; unsigned long flags; - ip = container_of(work, struct xfs_inode, i_ioend_work); - spin_lock_irqsave(&ip->i_ioend_lock, flags); - list_replace_init(&ip->i_ioend_list, &completion_list); + list_replace_init(&ip->i_ioend_list, &tmp); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - list_sort(NULL, &completion_list, xfs_ioend_compare); - - while (!list_empty(&completion_list)) { - ioend = list_first_entry(&completion_list, struct xfs_ioend, - io_list); + iomap_sort_ioends(&tmp); + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, + io_list))) { list_del_init(&ioend->io_list); - xfs_ioend_try_merge(ioend, &completion_list); + iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); xfs_end_ioend(ioend); } } +static inline bool 
xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) +{ + return ioend->io_private || + ioend->io_type == IOMAP_UNWRITTEN || + (ioend->io_flags & IOMAP_F_SHARED); +} + STATIC void xfs_end_bio( struct bio *bio) { - struct xfs_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = bio->bi_private; struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; unsigned long flags; - if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN || - ioend->io_append_trans != NULL) { - spin_lock_irqsave(&ip->i_ioend_lock, flags); - if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, - &ip->i_ioend_work)); - list_add_tail(&ioend->io_list, &ip->i_ioend_list); - spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - } else - xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); + ASSERT(xfs_ioend_needs_workqueue(ioend)); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + &ip->i_ioend_work)); + list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); } /* @@ -421,19 +266,19 @@ xfs_end_bio( */ static bool xfs_imap_valid( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + loff_t offset) { - if (offset_fsb < wpc->imap.br_startoff || - offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) return false; /* * If this is a COW mapping, it is sufficient to check that the mapping * covers the offset. Be careful to check this first because the caller * can revalidate a COW mapping without updating the data seqno. 
*/ - if (wpc->fork == XFS_COW_FORK) + if (wpc->iomap.flags & IOMAP_F_SHARED) return true; /* @@ -443,17 +288,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) return false; if (xfs_inode_has_cow_data(ip) && - wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) return false; return true; } /* * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->imap. + * extent that maps offset_fsb in wpc->iomap. * * The current page is held locked so nothing could have removed the block * backing offset_fsb, although it could have moved from the COW to the data @@ -461,32 +306,38 @@ xfs_imap_valid( */ static int xfs_convert_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + int whichfork, + loff_t offset) { int error; + unsigned *seq; + + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; /* - * Attempt to allocate whatever delalloc extent currently backs - * offset_fsb and put the result into wpc->imap. Allocate in a loop - * because it may take several attempts to allocate real blocks for a - * contiguous delalloc extent if free space is sufficiently fragmented. + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into wpc->iomap. Allocate in a loop because it + * may take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. */ do { - error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, - &wpc->imap, wpc->fork == XFS_COW_FORK ? 
- &wpc->cow_seq : &wpc->data_seq); + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) return error; - } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + } while (wpc->iomap.offset + wpc->iomap.length <= offset); return 0; } -STATIC int +static int xfs_map_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { @@ -496,6 +347,7 @@ xfs_map_blocks( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; + int whichfork = XFS_DATA_FORK; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -519,7 +371,7 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) + if (xfs_imap_valid(wpc, ip, offset)) return 0; /* @@ -541,10 +393,10 @@ retry: xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) cow_fsb = imap.br_startoff; if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { - wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); + XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_COW_FORK; + whichfork = XFS_COW_FORK; goto allocate_blocks; } @@ -552,7 +404,7 @@ retry: * No COW extent overlap. Revalidate now that we may have updated * ->cow_seq. If the data mapping is still valid, we're done. 
*/ - if (xfs_imap_valid(wpc, ip, offset_fsb)) { + if (xfs_imap_valid(wpc, ip, offset)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -564,11 +416,9 @@ retry: */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ - wpc->data_seq = READ_ONCE(ip->i_df.if_seq); + XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_DATA_FORK; - /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { imap.br_blockcount = imap.br_startoff - offset_fsb; @@ -592,11 +442,11 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, offset_fsb); + error = xfs_convert_blocks(wpc, ip, whichfork, offset); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -605,7 +455,7 @@ allocate_blocks: * the former case, but prevent additional retries to avoid * looping forever for the latter case. */ - if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++) goto retry; ASSERT(error != -EAGAIN); return error; @@ -616,34 +466,22 @@ allocate_blocks: * original delalloc one. Trim the return extent to the next COW * boundary again to force a re-lookup. 
*/ - if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && - cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) - wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) { + loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb); - ASSERT(wpc->imap.br_startoff <= offset_fsb); - ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); + if (cow_offset < wpc->iomap.offset + wpc->iomap.length) + wpc->iomap.length = cow_offset - wpc->iomap.offset; + } + + ASSERT(wpc->iomap.offset <= offset); + ASSERT(wpc->iomap.offset + wpc->iomap.length > offset); + trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap); return 0; } -/* - * Submit the bio for an ioend. We are passed an ioend with a bio attached to - * it, and we submit that bio. The ioend may be used for multiple bio - * submissions, so we only want to allocate an append transaction for the ioend - * once. In the case of multiple bio submission, each bio will take an IO - * reference to the ioend to ensure that the ioend completion is only done once - * all bios have been submitted and the ioend is really done. - * - * If @status is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the bio and ioend - * rather than submit it to IO. This typically only happens on a filesystem - * shutdown. 
- */ -STATIC int -xfs_submit_ioend( - struct writeback_control *wbc, - struct xfs_ioend *ioend, +static int +xfs_prepare_ioend( + struct iomap_ioend *ioend, int status) { unsigned int nofs_flag; @@ -656,157 +494,24 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && ioend->io_fork == XFS_COW_FORK) { + if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - (ioend->io_fork == XFS_COW_FORK || - ioend->io_state != XFS_EXT_UNWRITTEN) && + ((ioend->io_flags & IOMAP_F_SHARED) || + ioend->io_type != IOMAP_UNWRITTEN) && xfs_ioend_is_append(ioend) && - !ioend->io_append_trans) + !ioend->io_private) status = xfs_setfilesize_trans_alloc(ioend); memalloc_nofs_restore(nofs_flag); - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = xfs_end_bio; - - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. 
- */ - if (status) { - ioend->io_bio->bi_status = errno_to_blk_status(status); - bio_endio(ioend->io_bio); - return status; - } - - submit_bio(ioend->io_bio); - return 0; -} - -static struct xfs_ioend * -xfs_alloc_ioend( - struct inode *inode, - int fork, - xfs_exntst_t state, - xfs_off_t offset, - struct block_device *bdev, - sector_t sector, - struct writeback_control *wbc) -{ - struct xfs_ioend *ioend; - struct bio *bio; - - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - bio->bi_write_hint = inode->i_write_hint; - wbc_init_bio(wbc, bio); - - ioend = container_of(bio, struct xfs_ioend, io_inline_bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_fork = fork; - ioend->io_state = state; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = offset; - ioend->io_append_trans = NULL; - ioend->io_bio = bio; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to do perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in xfs_destroy_ioend(). - */ -static struct bio * -xfs_chain_bio( - struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - bio_copy_dev(new, prev);/* also copies over blkcg information */ - new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; - - bio_chain(prev, new); - bio_get(prev); /* for xfs_destroy_ioend */ - submit_bio(prev); - return new; -} - -/* - * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. 
- */ -STATIC void -xfs_add_to_ioend( - struct inode *inode, - xfs_off_t offset, - struct page *page, - struct iomap_page *iop, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct list_head *iolist) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; - sector_t sector; - - sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + - ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - - if (!wpc->ioend || - wpc->fork != wpc->ioend->io_fork || - wpc->imap.br_state != wpc->ioend->io_state || - sector != bio_end_sector(wpc->ioend->io_bio) || - offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, - wpc->imap.br_state, offset, bdev, sector, wbc); - } - - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - - if (iop && !same_page) - atomic_inc(&iop->write_count); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) - wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio); - bio_add_page(wpc->ioend->io_bio, page, len, poff); - } - - wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); -} - -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned int offset, - unsigned int length) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset, length); - iomap_invalidatepage(page, offset, length); + if (xfs_ioend_needs_workqueue(ioend)) + ioend->io_bio->bi_end_io = xfs_end_bio; + return status; } /* @@ -820,8 +525,8 @@ xfs_vm_invalidatepage( * transaction as there is no space left for block reservation (typically why we * see a ENOSPC in writeback). 
*/ -STATIC void -xfs_aops_discard_page( +static void +xfs_discard_page( struct page *page) { struct inode *inode = page->mapping->host; @@ -834,7 +539,7 @@ xfs_aops_discard_page( if (XFS_FORCED_SHUTDOWN(mp)) goto out_invalidate; - xfs_alert(mp, + xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); @@ -843,246 +548,14 @@ xfs_aops_discard_page( if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - xfs_vm_invalidatepage(page, 0, PAGE_SIZE); -} - -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. 
- */ -static int -xfs_writepage_map( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct inode *inode, - struct page *page, - uint64_t end_offset) -{ - LIST_HEAD(submit_list); - struct iomap_page *iop = to_iomap_page(page); - unsigned len = i_blocksize(inode); - struct xfs_ioend *ioend, *next; - uint64_t file_offset; /* file offset of page */ - int error = 0, count = 0, i; - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) == 0); - - /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. - */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { - if (iop && !test_bit(i, iop->uptodate)) - continue; - - error = xfs_map_blocks(wpc, inode, file_offset); - if (error) - break; - if (wpc->imap.br_startblock == HOLESTARTBLOCK) - continue; - xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, - &submit_list); - count++; - } - - ASSERT(wpc->ioend || list_empty(&submit_list)); - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); - - /* - * On error, we have to fail the ioend here because we may have set - * pages under writeback, we have to make sure we run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. That means we have to mark this page as under - * writeback if we included any blocks from it in the ioend chain so - * that completion treats it correctly. - * - * If we didn't include the page in the ioend, the on error we can - * simply discard and unlock it as there are no other users of the page - * now. The caller will still need to trigger submission of outstanding - * ioends on the writepage context so they are treated correctly on - * error. 
- */ - if (unlikely(error)) { - if (!count) { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - goto done; - } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); - } - - unlock_page(page); - - /* - * Preserve the original error if there was one, otherwise catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = xfs_submit_ioend(wbc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - end_page_writeback(page); -done: - mapping_set_error(page->mapping, error); - return error; + iomap_invalidatepage(page, 0, PAGE_SIZE); } -/* - * Write out a dirty page. - * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to - * regular allocated space. - */ -STATIC int -xfs_do_writepage( - struct page *page, - struct writeback_control *wbc, - void *data) -{ - struct xfs_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - loff_t offset; - uint64_t end_offset; - pgoff_t end_index; - - trace_xfs_writepage(inode, page, 0, 0); - - /* - * Refuse to write the page out if we are called from reclaim context. 
- * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Given that we do not allow direct reclaim to call us, we should - * never be called while in a filesystem transaction. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* - * Is this page beyond the end of the file? - * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. - * ----------------------------------------------------- - * | file mapping | <EOF> | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT; - else { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | <EOF> | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); - - /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. 
We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * xfs_vm_releasepage() is called on it and gets confused. - * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's - * offset is just equal to the EOF. - */ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) - goto redirty; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." 
- */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; - } - - return xfs_writepage_map(wpc, wbc, inode, page, end_offset); - -redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} +static const struct iomap_writeback_ops xfs_writeback_ops = { + .map_blocks = xfs_map_blocks, + .prepare_ioend = xfs_prepare_ioend, + .discard_page = xfs_discard_page, +}; STATIC int xfs_vm_writepage( @@ -1090,12 +563,8 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; - ret = xfs_do_writepage(page, wbc, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1104,13 +573,9 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1118,18 +583,11 @@ xfs_dax_writepages( struct address_space *mapping, struct writeback_control *wbc) { - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return dax_writeback_mapping_range(mapping, - xfs_find_bdev_for_inode(mapping->host), wbc); -} + struct xfs_inode *ip = XFS_I(mapping->host); -STATIC int -xfs_vm_releasepage( - struct page *page, - gfp_t gfp_mask) -{ - trace_xfs_releasepage(page->mapping->host, page, 0, 0); - return iomap_releasepage(page, gfp_mask); + xfs_iflags_clear(ip, XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, + xfs_inode_buftarg(ip)->bt_daxdev, wbc); } STATIC sector_t @@ -1152,7 +610,7 @@ xfs_vm_bmap( */ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; - return iomap_bmap(mapping, block, 
&xfs_iomap_ops); + return iomap_bmap(mapping, block, &xfs_read_iomap_ops); } STATIC int @@ -1160,19 +618,14 @@ xfs_vm_readpage( struct file *unused, struct page *page) { - trace_xfs_vm_readpage(page->mapping->host, 1); - return iomap_readpage(page, &xfs_iomap_ops); + return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - trace_xfs_vm_readpages(mapping->host, nr_pages); - return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -1181,18 +634,19 @@ xfs_iomap_swapfile_activate( struct file *swap_file, sector_t *span) { - sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file)); - return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops); + sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, + &xfs_read_iomap_ops); } const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, - .releasepage = xfs_vm_releasepage, - .invalidatepage = xfs_vm_invalidatepage, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 45a1ea240cbb..e0bd68419764 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -6,29 +6,9 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set xfs_ioend_bioset; - -/* - * Structure for buffered I/O completions. 
- */ -struct xfs_ioend { - struct list_head io_list; /* next ioend in chain */ - int io_fork; /* inode fork written back */ - xfs_exntst_t io_state; /* extent state */ - struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ - xfs_off_t io_offset; /* offset in the file */ - struct xfs_trans *io_append_trans;/* xact. for size update */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ -}; - extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); -extern struct block_device *xfs_find_bdev_for_inode(struct inode *); -extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *); - #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index a640a285cc52..c42f90e16b4f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -22,24 +22,21 @@ #include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_dir2.h" +#include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -47,47 +44,29 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. 
*/ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. 
- */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -101,86 +80,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); + int blkcnt; - /* - * Identify each of the "remote" value extents. 
- */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } @@ -190,37 +128,35 @@ xfs_attr3_leaf_inactive( */ STATIC int xfs_attr3_node_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp, - int level) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) { - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_dablk_t child_fsb; - xfs_daddr_t parent_blkno, child_blkno; - int error, i; - struct xfs_buf *child_bp; - struct xfs_da_node_entry *btree; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_blkinfo *info; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + struct xfs_buf *child_bp; struct xfs_da3_icnode_hdr ichdr; + int error, i; /* * Since this code is recursive (gasp!) we must protect ourselves. 
*/ if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return -EIO; + return -EFSCORRUPTED; } - node = bp->b_addr; - dp->d_ops->node_hdr_from_disk(&ichdr, node); + xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr); parent_blkno = bp->b_bn; if (!ichdr.count) { xfs_trans_brelse(*trans, bp); return 0; } - btree = dp->d_ops->node_tree_p(node); - child_fsb = be32_to_cpu(btree[0].before); + child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* @@ -235,7 +171,7 @@ xfs_attr3_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp, + error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp, XFS_ATTR_FORK); if (error) return error; @@ -258,8 +194,9 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - error = -EIO; + xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); + error = -EFSCORRUPTED; break; } if (error) @@ -268,10 +205,17 @@ xfs_attr3_node_inactive( /* * Remove the subsidiary block from the cache and from the log. */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, - XFS_ATTR_FORK); + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, + child_blkno, + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, + &child_bp); if (error) return error; + error = bp->b_error; + if (error) { + xfs_trans_brelse(*trans, child_bp); + return error; + } xfs_trans_binval(*trans, child_bp); /* @@ -279,13 +223,15 @@ xfs_attr3_node_inactive( * child block number. 
*/ if (i + 1 < ichdr.count) { - error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + struct xfs_da3_icnode_hdr phdr; + + error = xfs_da3_node_read_mapped(*trans, dp, + parent_blkno, &bp, XFS_ATTR_FORK); if (error) return error; - node = bp->b_addr; - btree = dp->d_ops->node_tree_p(node); - child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr, + bp->b_addr); + child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); } /* @@ -310,6 +256,7 @@ xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) { + struct xfs_mount *mp = dp->i_mount; struct xfs_da_blkinfo *info; struct xfs_buf *bp; xfs_daddr_t blkno; @@ -321,7 +268,7 @@ xfs_attr3_root_inactive( * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); if (error) return error; blkno = bp->b_bn; @@ -341,7 +288,8 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: - error = -EIO; + error = -EFSCORRUPTED; + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); break; } @@ -351,9 +299,15 @@ xfs_attr3_root_inactive( /* * Invalidate the incore copy of the root block. */ - error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, + XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); if (error) return error; + error = bp->b_error; + if (error) { + xfs_trans_brelse(*trans, bp); + return error; + } xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 00758fdc2fec..5ff1d929d3b5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -49,25 +49,22 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we can begin returning them to the user. */ static int -xfs_attr_shortform_list(xfs_attr_list_context_t *context) +xfs_attr_shortform_list( + struct xfs_attr_list_context *context) { - attrlist_cursor_kern_t *cursor; - xfs_attr_sf_sort_t *sbuf, *sbp; - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - xfs_inode_t *dp; - int sbsize, nsbuf, count, i; - - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; + struct xfs_inode *dp = context->dp; + struct xfs_attr_sf_sort *sbuf, *sbp; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + int sbsize, nsbuf, count, i; + int error = 0; + ASSERT(dp->i_afp != NULL); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; - cursor = context->cursor; - ASSERT(cursor != NULL); trace_xfs_attr_list_sf(context); @@ -84,6 +81,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(sfe->nameval, + sfe->namelen))) + return -EFSCORRUPTED; context->put_listent(context, sfe->flags, sfe->nameval, @@ -161,10 +162,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) break; } } - if (i == nsbuf) { - kmem_free(sbuf); - return 0; - } + if (i == nsbuf) + goto out; /* * Loop putting entries into the user buffer. 
@@ -174,6 +173,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(sbp->name, + sbp->namelen))) { + error = -EFSCORRUPTED; + goto out; + } context->put_listent(context, sbp->flags, sbp->name, @@ -183,9 +188,9 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) break; cursor->offset++; } - +out: kmem_free(sbuf); - return 0; + return error; } /* @@ -195,7 +200,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) STATIC int xfs_attr_node_list_lookup( struct xfs_attr_list_context *context, - struct attrlist_cursor_kern *cursor, + struct xfs_attrlist_cursor_kern *cursor, struct xfs_buf **pbp) { struct xfs_da3_icnode_hdr nodehdr; @@ -213,7 +218,7 @@ xfs_attr_node_list_lookup( ASSERT(*pbp == NULL); cursor->blkno = 0; for (;;) { - error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp, + error = xfs_da3_node_read(tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); if (error) return error; @@ -229,7 +234,7 @@ xfs_attr_node_list_lookup( goto out_corruptbuf; } - dp->d_ops->node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) @@ -243,7 +248,7 @@ xfs_attr_node_list_lookup( else expected_level--; - btree = dp->d_ops->node_tree_p(node); + btree = nodehdr.btree; for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); @@ -258,7 +263,7 @@ xfs_attr_node_list_lookup( return 0; /* We can't point back to the root. 
*/ - if (cursor->blkno == 0) + if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) return -EFSCORRUPTED; } @@ -269,6 +274,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } @@ -277,18 +283,17 @@ STATIC int xfs_attr_node_list( struct xfs_attr_list_context *context) { + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr3_icleaf_hdr leafhdr; - struct attrlist_cursor_kern *cursor; struct xfs_attr_leafblock *leaf; struct xfs_da_intnode *node; struct xfs_buf *bp; struct xfs_inode *dp = context->dp; struct xfs_mount *mp = dp->i_mount; - int error; + int error = 0; trace_xfs_attr_node_list(context); - cursor = context->cursor; cursor->initted = 1; /* @@ -298,8 +303,8 @@ xfs_attr_node_list( */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, + XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; if (bp) { @@ -358,29 +363,32 @@ xfs_attr_node_list( */ for (;;) { leaf = bp->b_addr; - xfs_attr3_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); + if (error) + break; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; xfs_trans_brelse(context->tp, bp); - error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp); + error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, + &bp); if (error) return error; } xfs_trans_brelse(context->tp, bp); - return 0; + return error; } /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
*/ -void +int xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; @@ -394,7 +402,6 @@ xfs_attr3_leaf_list_int( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - cursor = context->cursor; cursor->initted = 1; /* @@ -417,7 +424,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return; + return 0; } } else { entry = &entries[0]; @@ -438,8 +445,8 @@ xfs_attr3_leaf_list_int( } if ((entry->flags & XFS_ATTR_INCOMPLETE) && - !(context->flags & ATTR_INCOMPLETE)) - continue; /* skip incomplete entries */ + !context->allow_incomplete) + continue; if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc; @@ -457,6 +464,9 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } + if (XFS_IS_CORRUPT(context->dp->i_mount, + !xfs_attr_namecheck(name, namelen))) + return -EFSCORRUPTED; context->put_listent(context, entry->flags, name, namelen, valuelen); if (context->seen_enough) @@ -464,32 +474,33 @@ xfs_attr3_leaf_list_int( cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return; + return 0; } /* * Copy out attribute entries for attr_list(), for leaf attribute lists. 
*/ STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) +xfs_attr_leaf_list( + struct xfs_attr_list_context *context) { - int error; - struct xfs_buf *bp; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_list(context); - context->cursor->blkno = 0; - error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp); + context->cursor.blkno = 0; + error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); if (error) return error; - xfs_attr3_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(context->tp, bp); - return 0; + return error; } int -xfs_attr_list_int_ilocked( +xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; @@ -509,12 +520,12 @@ xfs_attr_list_int_ilocked( } int -xfs_attr_list_int( - xfs_attr_list_context_t *context) +xfs_attr_list( + struct xfs_attr_list_context *context) { - int error; - xfs_inode_t *dp = context->dp; - uint lock_mode; + struct xfs_inode *dp = context->dp; + uint lock_mode; + int error; XFS_STATS_INC(dp->i_mount, xs_attr_list); @@ -522,130 +533,7 @@ xfs_attr_list_int( return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); - error = xfs_attr_list_int_ilocked(context); + error = xfs_attr_list_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ - & ~(sizeof(uint32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. 
- */ -STATIC void -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. 
- */ - if (cursor->pad1 || cursor->pad2) - return -EINVAL; - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return -EINVAL; - - /* Only internal consumers can retrieve incomplete attrs. */ - if (flags & ATTR_INCOMPLETE) - return -EINVAL; - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return -EFAULT; - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error <= 0); - return error; -} diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 83d24e983d4c..ee6f4229cebc 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -21,7 +21,7 @@ #include "xfs_icache.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" - +#include "xfs_error.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -35,7 +35,7 @@ void xfs_bui_item_free( struct xfs_bui_log_item *buip) { - kmem_zone_free(xfs_bui_zone, buip); + kmem_cache_free(xfs_bui_zone, buip); } /* @@ -201,7 +201,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); - kmem_zone_free(xfs_bud_zone, budp); + kmem_cache_free(xfs_bud_zone, budp); } static const struct xfs_item_ops xfs_bud_item_ops = { @@ -456,7 +456,7 @@ xfs_bui_recover( if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } /* @@ -490,7 +490,7 @@ xfs_bui_recover( 
*/ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, @@ -525,6 +525,7 @@ xfs_bui_recover( type = bui_type; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto err_inode; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4f443703065e..4f800f7fe888 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -53,15 +53,16 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int xfs_zero_extent( - struct xfs_inode *ip, - xfs_fsblock_t start_fsb, - xfs_off_t count_fsb) + struct xfs_inode *ip, + xfs_fsblock_t start_fsb, + xfs_off_t count_fsb) { - struct xfs_mount *mp = ip->i_mount; - xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); - sector_t block = XFS_BB_TO_FSBT(mp, sector); + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); + sector_t block = XFS_BB_TO_FSBT(mp, sector); - return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + return blkdev_issue_zeroout(target->bt_bdev, block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), GFP_NOFS, 0); @@ -164,13 +165,6 @@ xfs_bmap_rtalloc( xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); - - /* Zero the extent if we were asked to do so */ - if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { - error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); - if (error) - return error; - } } else { ap->length = 0; } @@ -179,29 +173,6 @@ xfs_bmap_rtalloc( #endif /* CONFIG_XFS_RT */ /* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. 
- */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* * Extent tree block counting routines. */ @@ -229,106 +200,6 @@ xfs_bmap_count_leaves( } /* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - xfs_filblks_t *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks in use. - */ -STATIC int -xfs_bmap_count_tree( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_ifork *ifp, - xfs_fsblock_t blockno, - int levelin, - xfs_extnum_t *nextents, - xfs_filblks_t *count) -{ - int error; - struct xfs_buf *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - 
bno = be64_to_cpu(*pp); - error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents, - count); - if (error) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - (*nextents) += numrecs; - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* * Count fsblocks of the given fork. Delayed allocation extents are * not counted towards the totals. */ @@ -340,26 +211,19 @@ xfs_bmap_count_blocks( xfs_extnum_t *nextents, xfs_filblks_t *count) { - struct xfs_mount *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - struct xfs_btree_block *block; /* current btree block */ - struct xfs_ifork *ifp; /* fork structure */ - xfs_fsblock_t bno; /* block # of "block" */ - int level; /* btree level, for checking */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + xfs_extlen_t btblocks = 0; int error; - bno = NULLFSBLOCK; - mp = ip->i_mount; *nextents = 0; *count = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + if (!ifp) return 0; switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_EXTENTS: - *nextents = xfs_bmap_count_leaves(ifp, count); - return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -367,26 +231,23 @@ xfs_bmap_count_blocks( return error; } + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + error = xfs_btree_count_blocks(cur, &btblocks); 
+ xfs_btree_del_cursor(cur, error); + if (error) + return error; + /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + * xfs_btree_count_blocks includes the root block contained in + * the inode fork in @btblocks, so subtract one because we're + * only interested in allocated disk blocks. */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, - nextents, count); - if (error) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", - XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - return 0; + *count += btblocks - 1; + + /* fall through */ + case XFS_DINODE_FMT_EXTENTS: + *nextents = xfs_bmap_count_leaves(ifp, count); + break; } return 0; @@ -964,8 +825,8 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, resblks, - imapp, &nimaps); + allocatesize_fsb, alloc_type, 0, imapp, + &nimaps); if (error) goto error0; @@ -1039,6 +900,7 @@ out_trans_cancel: goto out_unlock; } +/* Caller must first wait for the completion of any pending DIOs if required. 
*/ int xfs_flush_unmap_range( struct xfs_inode *ip, @@ -1050,9 +912,6 @@ xfs_flush_unmap_range( xfs_off_t rounding, start, end; int error; - /* wait for the completion of any pending DIOs */ - inode_dio_wait(inode); - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); start = round_down(offset, rounding); end = round_up(offset + len, rounding) - 1; @@ -1084,10 +943,6 @@ xfs_free_file_space( if (len <= 0) /* if nothing being freed */ return 0; - error = xfs_flush_unmap_range(ip, offset, len); - if (error) - return error; - startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); @@ -1113,7 +968,8 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, + &xfs_buffered_write_iomap_ops); if (error) return error; @@ -1131,48 +987,12 @@ xfs_free_file_space( return error; } -/* - * Preallocate and zero a range of a file. This mechanism has the allocation - * semantics of fallocate and in addition converts data in the range to zeroes. - */ -int -xfs_zero_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) -{ - struct xfs_mount *mp = ip->i_mount; - uint blksize; - int error; - - trace_xfs_zero_file_space(ip); - - blksize = 1 << mp->m_sb.sb_blocklog; - - /* - * Punch a hole and prealloc the range. We use hole punch rather than - * unwritten extent conversion for two reasons: - * - * 1.) Hole punch handles partial block zeroing for us. - * - * 2.) If prealloc returns ENOSPC, the file range is still zero-valued - * by virtue of the hole punch. 
- */ - error = xfs_free_file_space(ip, offset, len); - if (error || xfs_is_always_cow_inode(ip)) - return error; - - return xfs_alloc_file_space(ip, round_down(offset, blksize), - round_up(offset + len, blksize) - - round_down(offset, blksize), - XFS_BMAPI_PREALLOC); -} - static int xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1186,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. 
*/ @@ -1231,7 +1062,6 @@ xfs_collapse_file_space( int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); - uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); bool done = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1247,32 +1077,34 @@ xfs_collapse_file_space( if (error) return error; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) - break; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; + if (done) + break; - error = xfs_trans_commit(tp); + /* finish any deferred frees and roll the transaction */ + error = xfs_defer_finish(&tp); + if (error) + goto out_trans_cancel; } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1315,35 +1147,41 @@ xfs_insert_file_space( if (error) return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. 
*/ - error = xfs_bmap_split_extent(ip, stop_fsb); + error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) - return error; + goto out_trans_cancel; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, - &tp); + do { + error = xfs_trans_roll_inode(&tp, ip); if (error) - break; + goto out_trans_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; + } while (!done); - error = xfs_trans_commit(tp); - } - + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1611,12 +1449,12 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*src_log_flags) |= XFS_ILOG_DOWNER; + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; + } /* * Swap the data forks of the inodes @@ -1651,7 +1489,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1663,7 +1501,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(tip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } @@ -1750,6 
+1588,14 @@ xfs_swap_extents( goto out_unlock; } + error = xfs_qm_dqattach(ip); + if (error) + goto out_unlock; + + error = xfs_qm_dqattach(tip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(ip); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 7a78229cf1a7..9f993168b55b 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,8 +30,6 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, - int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -59,8 +57,6 @@ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, int alloc_type); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0abba171aa89..65538d18e64f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -198,20 +198,22 @@ xfs_buf_free_maps( } } -static struct xfs_buf * +static int _xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; int error; int i; + *bpp = NULL; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) - return NULL; + return -ENOMEM; /* * We don't want certain flags to appear in b_flags unless they are @@ -238,8 +240,8 @@ _xfs_buf_alloc( */ error = xfs_buf_get_maps(bp, nmaps); if (error) { - kmem_zone_free(xfs_buf_zone, bp); - return NULL; + kmem_cache_free(xfs_buf_zone, bp); + return error; } bp->b_bn = map[0].bm_bn; @@ -256,7 +258,8 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); 
trace_xfs_buf_init(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; } /* @@ -304,7 +307,7 @@ _xfs_buf_free_pages( * The buffer must not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ -void +static void xfs_buf_free( xfs_buf_t *bp) { @@ -324,11 +327,14 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); xfs_buf_free_maps(bp); - kmem_zone_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_zone, bp); } /* @@ -461,7 +467,7 @@ _xfs_buf_map_pages( unsigned nofs_flag; /* - * vm_map_ram() will allocate auxillary structures (e.g. + * vm_map_ram() will allocate auxiliary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim * that we are in such a context via PF_MEMALLOC_NOFS to prevent @@ -471,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); @@ -682,53 +688,39 @@ xfs_buf_incore( * cache hits, as metadata intensive workloads will see 3 orders of magnitude * more hits than misses. 
*/ -struct xfs_buf * +int xfs_buf_get_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; + *bpp = NULL; error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - - switch (error) { - case 0: - /* cache hit */ + if (!error) goto found; - case -EAGAIN: - /* cache hit, trylock failure, caller handles failure */ - ASSERT(flags & XBF_TRYLOCK); - return NULL; - case -ENOENT: - /* cache miss, go for insert */ - break; - case -EFSCORRUPTED: - default: - /* - * None of the higher layers understand failure types - * yet, so return NULL to signal a fatal lookup error. - */ - return NULL; - } + if (error != -ENOENT) + return error; - new_bp = _xfs_buf_alloc(target, map, nmaps, flags); - if (unlikely(!new_bp)) - return NULL; + error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + if (error) + return error; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); if (error) { xfs_buf_free(new_bp); - return NULL; + return error; } if (bp != new_bp) @@ -738,10 +730,11 @@ found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + xfs_warn_ratelimited(target->bt_mount, + "%s: failed to map %u pages", __func__, + bp->b_page_count); xfs_buf_relse(bp); - return NULL; + return error; } } @@ -754,7 +747,8 @@ found: XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); - return bp; + *bpp = bp; + return 0; } STATIC int @@ -806,46 +800,77 @@ xfs_buf_reverify( return bp->b_error; } -xfs_buf_t * +int xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops) + struct xfs_buf **bpp, + const struct xfs_buf_ops 
*ops, + xfs_failaddr_t fa) { struct xfs_buf *bp; + int error; flags |= XBF_READ; + *bpp = NULL; - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (!bp) - return NULL; + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; trace_xfs_buf_read(bp, flags, _RET_IP_); if (!(bp->b_flags & XBF_DONE)) { + /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - _xfs_buf_read(bp, flags); - return bp; + error = _xfs_buf_read(bp, flags); + + /* Readahead iodone already dropped the buffer, so exit. */ + if (flags & XBF_ASYNC) + return 0; + } else { + /* Buffer already read; all we need to do is check it. */ + error = xfs_buf_reverify(bp, ops); + + /* Readahead already finished; drop the buffer and exit. */ + if (flags & XBF_ASYNC) { + xfs_buf_relse(bp); + return 0; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); } - xfs_buf_reverify(bp, ops); + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret its contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. 
+ */ + if (error) { + if (!XFS_FORCED_SHUTDOWN(target->bt_mount)) + xfs_buf_ioerror_alert(bp, fa); - if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); xfs_buf_relse(bp); - return NULL; + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; } - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - ASSERT(bp->b_ops != NULL || ops == NULL); - return bp; + *bpp = bp; + return 0; } /* @@ -859,11 +884,14 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + struct xfs_buf *bp; + if (bdi_read_congested(target->bt_bdev->bd_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); + XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, + __this_address); } /* @@ -880,12 +908,13 @@ xfs_buf_read_uncached( const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + int error; *bpp = NULL; - bp = xfs_buf_get_uncached(target, numblks, flags); - if (!bp) - return -ENOMEM; + error = xfs_buf_get_uncached(target, numblks, flags, &bp); + if (error) + return error; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); @@ -896,7 +925,7 @@ xfs_buf_read_uncached( xfs_buf_submit(bp); if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } @@ -905,20 +934,23 @@ xfs_buf_read_uncached( return 0; } -xfs_buf_t * +int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags) + int flags, + struct xfs_buf **bpp) { unsigned long page_count; int error, i; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + *bpp = NULL; + /* flags might contain irrelevant bits, pass only what we care about */ - bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); - if (unlikely(bp == NULL)) + error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); + if (error) goto 
fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; @@ -928,8 +960,10 @@ xfs_buf_get_uncached( for (i = 0; i < page_count; i++) { bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); - if (!bp->b_pages[i]) + if (!bp->b_pages[i]) { + error = -ENOMEM; goto fail_free_mem; + } } bp->b_flags |= _XBF_PAGES; @@ -941,7 +975,8 @@ xfs_buf_get_uncached( } trace_xfs_buf_get_uncached(bp, _RET_IP_); - return bp; + *bpp = bp; + return 0; fail_free_mem: while (--i >= 0) @@ -949,9 +984,9 @@ xfs_buf_get_uncached( _xfs_buf_free_pages(bp); fail_free_buf: xfs_buf_free_maps(bp); - kmem_zone_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_zone, bp); fail: - return NULL; + return error; } /* @@ -1205,10 +1240,10 @@ __xfs_buf_ioerror( void xfs_buf_ioerror_alert( struct xfs_buf *bp, - const char *func) + xfs_failaddr_t func) { - xfs_alert(bp->b_mount, -"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", + xfs_alert_ratelimited(bp->b_mount, +"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); } @@ -1261,8 +1296,7 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int op, - int op_flags) + int op) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1297,7 +1331,7 @@ next_chunk: bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1342,7 +1376,6 @@ _xfs_buf_ioapply( { struct blk_plug plug; int op; - int op_flags = 0; int offset; int size; int i; @@ -1384,15 +1417,14 @@ _xfs_buf_ioapply( dump_stack(); } } - } else if (bp->b_flags & XBF_READ_AHEAD) { - op = REQ_OP_READ; - op_flags = REQ_RAHEAD; } else { op = REQ_OP_READ; + if (bp->b_flags & XBF_READ_AHEAD) + op |= REQ_RAHEAD; } /* we only use the buffer cache for meta-data */ - op_flags |= REQ_META; + op |= REQ_META; /* * Walk all the 
vectors issuing IO on them. Set up the initial offset @@ -1404,7 +1436,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); + xfs_buf_ioapply_map(bp, i, &offset, &size, op); if (bp->b_error) break; if (size <= 0) @@ -1545,6 +1577,28 @@ xfs_buf_zero( } /* + * Log a message about and stale a buffer that a caller has decided is corrupt. + * + * This function should be called for the kinds of metadata corruption that + * cannot be detected by a verifier, such as incorrect inter-block relationship + * data. Do /not/ call this function from a verifier function. + * + * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will + * be marked stale, but b_error will not be set. The caller is responsible for + * releasing the buffer or fixing it. + */ +void +__xfs_buf_mark_corrupt( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + ASSERT(bp->b_flags & XBF_DONE); + + xfs_buf_corruption_error(bp, fa); + xfs_buf_stale(bp); +} + +/* * Handling of buffer targets (buftargs). 
*/ @@ -2063,8 +2117,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", - KM_ZONE_HWALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; @@ -2077,7 +2134,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - kmem_zone_destroy(xfs_buf_zone); + kmem_cache_destroy(xfs_buf_zone); } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f6ce17d8d848..9a04c53c2488 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -192,37 +192,40 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, - const struct xfs_buf_ops *ops); +int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); +int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); -static inline struct xfs_buf * +static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_get_map(target, &map, 1, 0); + + return xfs_buf_get_map(target, &map, 1, 0, bpp); } -static inline struct xfs_buf * +static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, 
xfs_buf_flags_t flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags, ops); + + return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, + __builtin_return_address(0)); } static inline void @@ -236,15 +239,14 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - int flags); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ -extern void xfs_buf_free(xfs_buf_t *); extern void xfs_buf_rele(xfs_buf_t *); /* Locking and Unlocking Buffers */ @@ -260,7 +262,7 @@ extern void xfs_buf_ioend(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) -extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) @@ -270,6 +272,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) } void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); +void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); +#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d74fbd1e9d3e..1545657c3ca0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item 
*BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +/* Is this log iovec plausibly large enough to contain the buffer log format? */ +bool +xfs_buf_log_check_iovec( + struct xfs_log_iovec *iovec) +{ + struct xfs_buf_log_format *blfp = iovec->i_addr; + char *bmp_end; + char *item_end; + + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + return false; + + item_end = (char *)iovec->i_addr + iovec->i_len; + bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; + return bmp_end <= item_end; +} + static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) @@ -328,7 +345,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC int +STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) @@ -698,14 +715,11 @@ xfs_buf_item_get_format( if (count == 1) { bip->bli_formats = &bip->__bli_format; - return 0; + return; } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 0); - if (!bip->bli_formats) - return -ENOMEM; - return 0; } STATIC void @@ -731,7 +745,6 @@ xfs_buf_item_init( struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; - int error; int i; /* @@ -760,19 +773,22 @@ xfs_buf_item_init( * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. 
*/ - error = xfs_buf_item_get_format(bip, bp->b_map_count); - ASSERT(error == 0); - if (error) { /* to stop gcc throwing set-but-unused warnings */ - kmem_zone_free(xfs_buf_item_zone, bip); - return error; - } - + xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) { + kmem_cache_free(xfs_buf_item_zone, bip); + xfs_err(mp, + "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", + map_size, + BBTOB(bp->b_maps[i].bm_len)); + return -EFSCORRUPTED; + } + bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; @@ -805,6 +821,9 @@ xfs_buf_item_log_segment( uint end_bit; uint mask; + ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + /* * Convert byte offsets to bit numbers. */ @@ -851,7 +870,7 @@ xfs_buf_item_log_segment( * first_bit and last_bit. 
*/ while ((bits_to_set - bits_set) >= NBWORD) { - *wordp |= 0xffffffff; + *wordp = 0xffffffff; bits_set += NBWORD; wordp++; } @@ -939,7 +958,7 @@ xfs_buf_item_free( { xfs_buf_item_free_format(bip); kmem_free(bip->bli_item.li_lv_shadow); - kmem_zone_free(xfs_buf_item_zone, bip); + kmem_cache_free(xfs_buf_item_zone, bip); } /* @@ -956,7 +975,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) @@ -1094,7 +1113,7 @@ xfs_buf_iodone_callback_error( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); } lasttarg = bp->b_target; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4a054b11011a..30114b510332 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, struct list_head *); +bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 283df898dd9f..871ec22c9aee 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -17,6 +17,7 @@ #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" +#include "xfs_error.h" /* * Directory file type support functions @@ -47,6 +48,7 @@ xfs_dir2_sf_getdents( { int i; /* shortform entry number */ struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform 
structure */ @@ -68,15 +70,15 @@ xfs_dir2_sf_getdents( return 0; /* - * Precalculate offsets for . and .. as we will always need them. - * - * XXX(hch): the second argument is sometimes 0 and sometimes - * geo->datablk + * Precalculate offsets for "." and ".." as we will always need them. + * This relies on the fact that directories always start with the + * entries for "." and "..". */ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - dp->d_ops->data_dot_offset); + geo->data_entry_offset); dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - dp->d_ops->data_dotdot_offset); + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); /* * Put . entry unless we're starting past it. @@ -91,7 +93,7 @@ xfs_dir2_sf_getdents( * Put .. entry unless we're starting past it. */ if (ctx->pos <= dotdot_offset) { - ino = dp->d_ops->sf_get_parent_ino(sfp); + ino = xfs_dir2_sf_get_parent_ino(sfp); ctx->pos = dotdot_offset & 0x7fffffff; if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; @@ -108,17 +110,21 @@ xfs_dir2_sf_getdents( xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); continue; } - ino = dp->d_ops->sf_get_ino(sfp, sfep); - filetype = dp->d_ops->sf_get_ftype(sfep); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + filetype = xfs_dir2_sf_get_ftype(mp, sfep); ctx->pos = off & 0x7fffffff; + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(sfep->name, + sfep->namelen))) + return -EFSCORRUPTED; if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, - xfs_dir3_get_dtype(dp->i_mount, filetype))) + xfs_dir3_get_dtype(mp, filetype))) return 0; - sfep = dp->d_ops->sf_nextentry(sfp, sfep); + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & @@ -135,17 +141,14 @@ xfs_dir2_block_getdents( struct dir_context *ctx) { struct xfs_inode *dp = args->dp; /* incore directory inode */ - 
xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_data_entry_t *dep; /* block data entry */ - xfs_dir2_data_unused_t *dup; /* block unused entry */ - char *endptr; /* end of the data entries */ int error; /* error return value */ - char *ptr; /* current data entry */ int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; + unsigned int offset, next_offset; + unsigned int end; /* * If the block number in the offset is out of range, we're done. @@ -164,56 +167,57 @@ xfs_dir2_block_getdents( * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); - hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - /* - * Set up values for the loop. - */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); - endptr = xfs_dir3_data_endp(geo, hdr); /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ - while (ptr < endptr) { + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + for (offset = geo->data_entry_offset; + offset < end; + offset = next_offset) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; - dup = (xfs_dir2_data_unused_t *)ptr; /* * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); + next_offset = offset + be16_to_cpu(dup->length); continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - /* * Bump pointer for the next iteration. */ - ptr += dp->d_ops->data_entsize(dep->namelen); + next_offset = offset + + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); + /* * The entry is before the desired starting point, skip it. 
*/ - if ((char *)dep - (char *)hdr < wantoff) + if (offset < wantoff) continue; - cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, - (char *)dep - (char *)hdr); + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset); ctx->pos = cook & 0x7fffffff; - filetype = dp->d_ops->data_get_ftype(dep); + filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); /* * If it didn't fit, set the final offset to here & return. */ + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(dep->name, + dep->namelen))) { + error = -EFSCORRUPTED; + goto out_rele; + } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(dp->i_mount, filetype))) { - xfs_trans_brelse(args->trans, bp); - return 0; - } + xfs_dir3_get_dtype(dp->i_mount, filetype))) + goto out_rele; } /* @@ -222,8 +226,9 @@ xfs_dir2_block_getdents( */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; +out_rele: xfs_trans_brelse(args->trans, bp); - return 0; + return error; } /* @@ -276,7 +281,7 @@ xfs_dir2_leaf_readbuf( new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; - error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp); + error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp); if (error) goto out; @@ -311,7 +316,8 @@ xfs_dir2_leaf_readbuf( break; } if (next_ra > *ra_blk) { - xfs_dir3_data_readahead(dp, next_ra, -2); + xfs_dir3_data_readahead(dp, next_ra, + XFS_DABUF_MAP_HOLE_OK); *ra_blk = next_ra; } ra_want -= geo->fsbcount; @@ -343,17 +349,17 @@ xfs_dir2_leaf_getdents( size_t bufsize) { struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = NULL; /* data block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ - char *ptr = NULL; /* pointer to current data */ struct xfs_da_geometry *geo = args->geo; xfs_dablk_t rablk = 0; /* 
current readahead block */ xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ int lock_mode; + unsigned int offset = 0; int error = 0; /* error return value */ /* @@ -380,7 +386,7 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + if (!bp || offset >= geo->blksize) { if (bp) { xfs_trans_brelse(args->trans, bp); bp = NULL; @@ -393,36 +399,35 @@ xfs_dir2_leaf_getdents( if (error || !bp) break; - hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)dp->d_ops->data_entry_p(hdr); + offset = geo->data_entry_offset; byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += dp->d_ops->data_entry_offset; + curoff += geo->data_entry_offset; /* * Skip past entries until we reach our offset. */ else { - while ((char *)ptr - (char *)hdr < byteoff) { - dup = (xfs_dir2_data_unused_t *)ptr; + while (offset < byteoff) { + dup = bp->b_addr + offset; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); - ptr += length; + offset += length; continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - length = - dp->d_ops->data_entsize(dep->namelen); - ptr += length; + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, + dep->namelen); + offset += length; } /* * Now set our real offset. @@ -430,32 +435,38 @@ xfs_dir2_leaf_getdents( curoff = xfs_dir2_db_off_to_byte(geo, xfs_dir2_byte_to_db(geo, curoff), - (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + geo->blksize) { + offset); + if (offset >= geo->blksize) continue; - } } } + /* - * We have a pointer to an entry. - * Is it a live one? + * We have a pointer to an entry. Is it a live one? 
*/ - dup = (xfs_dir2_data_unused_t *)ptr; + dup = bp->b_addr + offset; + /* * No, it's unused, skip over it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); - ptr += length; + offset += length; curoff += length; continue; } - dep = (xfs_dir2_data_entry_t *)ptr; - length = dp->d_ops->data_entsize(dep->namelen); - filetype = dp->d_ops->data_get_ftype(dep); + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + filetype = xfs_dir2_data_get_ftype(mp, dep); ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (XFS_IS_CORRUPT(dp->i_mount, + !xfs_dir2_namecheck(dep->name, + dep->namelen))) { + error = -EFSCORRUPTED; + break; + } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) @@ -464,7 +475,7 @@ xfs_dir2_leaf_getdents( /* * Advance to next entry in the block. */ - ptr += length; + offset += length; curoff += length; /* bufsize may have just been a guess; don't go negative */ bufsize = bufsize > length ? bufsize - length : 0; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8ec7aab89044..f979d0d7e6cd 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -13,6 +13,7 @@ #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_discard.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" @@ -30,6 +31,7 @@ xfs_trim_extents( struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; + struct xfs_agf *agf; struct xfs_perag *pag; int error; int i; @@ -44,16 +46,16 @@ xfs_trim_extents( xfs_log_force(mp, XFS_LOG_SYNC); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) + if (error) goto out_put_perag; + agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. 
*/ - error = xfs_alloc_lookup_ge(cur, 0, - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); if (error) goto out_del_cursor; @@ -70,8 +72,11 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); - ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + if (XFS_IS_CORRUPT(mp, i != 1)) { + error = -EFSCORRUPTED; + goto out_del_cursor; + } + ASSERT(flen <= be32_to_cpu(agf->agf_longest)); /* * use daddr format for all range/len calculations as that is diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index aeb95e7391c1..af2c8e5ceea0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -48,7 +48,7 @@ static struct lock_class_key xfs_dquot_project_class; */ void xfs_qm_dqdestroy( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); @@ -56,7 +56,7 @@ xfs_qm_dqdestroy( mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); - kmem_zone_free(xfs_qm_dqzone, dqp); + kmem_cache_free(xfs_qm_dqzone, dqp); } /* @@ -113,8 +113,8 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_disk_dquot *d) { ASSERT(d->d_id); @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (d->d_blk_hardlimit && (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + + d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; @@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers( (d->d_ino_hardlimit && (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + + d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; @@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers( (d->d_rtb_hardlimit && 
(be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + + d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; @@ -305,8 +305,8 @@ xfs_dquot_disk_alloc( /* Create the block mapping. */ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - XFS_QM_DQALLOC_SPACE_RES(mp), &map, &nmaps); + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, + &nmaps); if (error) return error; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -320,10 +320,10 @@ xfs_dquot_disk_alloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; bp->b_ops = &xfs_dquot_buf_ops; /* @@ -497,7 +497,7 @@ xfs_dquot_from_disk( struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); /* * Reservation counters are defined as reservation plus current usage @@ -829,11 +829,11 @@ xfs_qm_id_for_quotatype( { switch (type) { case XFS_DQ_USER: - return ip->i_d.di_uid; + return i_uid_read(VFS_I(ip)); case XFS_DQ_GROUP: - return ip->i_d.di_gid; + return i_gid_read(VFS_I(ip)); case XFS_DQ_PROJ: - return xfs_get_projid(ip); + return ip->i_d.di_projid; } ASSERT(0); return 0; @@ -989,7 +989,7 @@ xfs_qm_dqput( */ void xfs_qm_dqrele( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { if (!dqp) return; @@ -1018,8 +1018,8 @@ xfs_qm_dqflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - 
xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; - xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; + struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; /* @@ -1105,8 +1105,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1126,11 +1126,11 @@ xfs_qm_dqflush( xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EIO; + return -EFSCORRUPTED; } /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); /* * Clear the dirty field and remember the flush lsn for later use. @@ -1177,7 +1177,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* @@ -1188,8 +1188,8 @@ out_unlock: */ void xfs_dqlock2( - xfs_dquot_t *d1, - xfs_dquot_t *d2) + struct xfs_dquot *d1, + struct xfs_dquot *d2) { if (d1 && d2) { ASSERT(d1 != d2); @@ -1211,20 +1211,22 @@ xfs_dqlock2( int __init xfs_qm_init(void) { - xfs_qm_dqzone = - kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + xfs_qm_dqzone = kmem_cache_create("xfs_dquot", + sizeof(struct xfs_dquot), + 0, 0, NULL); if (!xfs_qm_dqzone) goto out; - xfs_qm_dqtrxzone = - kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx", + sizeof(struct xfs_dquot_acct), + 0, 0, NULL); if (!xfs_qm_dqtrxzone) goto out_free_dqzone; return 0; out_free_dqzone: - kmem_zone_destroy(xfs_qm_dqzone); + kmem_cache_destroy(xfs_qm_dqzone); out: return -ENOMEM; } @@ -1232,8 +1234,8 @@ out: void xfs_qm_exit(void) { - kmem_zone_destroy(xfs_qm_dqtrxzone); - kmem_zone_destroy(xfs_qm_dqzone); 
+ kmem_cache_destroy(xfs_qm_dqtrxzone); + kmem_cache_destroy(xfs_qm_dqzone); } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 4fe85709d55d..fe3e46df604b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -30,33 +30,36 @@ enum { /* * The incore dquot structure */ -typedef struct xfs_dquot { - uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_lru; /* global free list of dquots */ - struct xfs_mount*q_mount; /* filesystem this relates to */ - uint q_nrefs; /* # active refs from inodes */ - xfs_daddr_t q_blkno; /* blkno of dquot buffer */ - int q_bufoffset; /* off of dq in buffer (# dquots) */ - xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - - xfs_disk_dquot_t q_core; /* actual usage & quotas */ - xfs_dq_logitem_t q_logitem; /* dquot log item */ - xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ - xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ - xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ - xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ - int64_t q_low_space[XFS_QLOWSP_MAX]; - struct mutex q_qlock; /* quota lock */ - struct completion q_flush; /* flush completion queue */ - atomic_t q_pincount; /* dquot pin count */ - wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ -} xfs_dquot_t; +struct xfs_dquot { + uint dq_flags; + struct list_head q_lru; + struct xfs_mount *q_mount; + uint q_nrefs; + xfs_daddr_t q_blkno; + int q_bufoffset; + xfs_fileoff_t q_fileoffset; + + struct xfs_disk_dquot q_core; + struct xfs_dq_logitem q_logitem; + /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_bcount; + /* total inos allocd+reserved */ + xfs_qcnt_t q_res_icount; + /* total realtime blks used+reserved */ + xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; + xfs_qcnt_t q_prealloc_hi_wmark; + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; + struct completion q_flush; + 
atomic_t q_pincount; + struct wait_queue_head q_pinwait; +}; /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, - * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 */ enum { XFS_QLOCK_NORMAL = 0, @@ -64,21 +67,21 @@ enum { }; /* - * Manage the q_flush completion queue embedded in the dquot. This completion + * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to * disk. */ -static inline void xfs_dqflock(xfs_dquot_t *dqp) +static inline void xfs_dqflock(struct xfs_dquot *dqp) { wait_for_completion(&dqp->q_flush); } -static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) +static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp) { return try_wait_for_completion(&dqp->q_flush); } -static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +static inline void xfs_dqfunlock(struct xfs_dquot *dqp) { complete(&dqp->q_flush); } @@ -112,7 +115,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) } } -static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) { switch (type & XFS_DQ_ALLTYPES) { case XFS_DQ_USER: @@ -147,31 +150,30 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); -extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, - xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, - struct xfs_dquot *); -extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, - uint type); -extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, +void 
xfs_qm_dqdestroy(struct xfs_dquot *dqp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); +void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, + struct xfs_disk_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, + struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, uint type, bool can_alloc, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, - bool can_alloc, - struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, +int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, uint type, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, - xfs_dqid_t id, uint type, - struct xfs_dquot **dqpp); -extern void xfs_qm_dqput(xfs_dquot_t *); +int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); +void xfs_qm_dqput(struct xfs_dquot *dqp); -extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d60647d7197b..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed( { struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); 
struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - struct xfs_ail *ailp = qfs->qql_item.li_ailp; - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_ail_delete() drops the AIL lock. - */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + xfs_qm_qoff_logitem_relse(qfs); - kmem_free(qfs->qql_item.li_lv_shadow); kmem_free(lip->li_lv_shadow); - kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; } +STATIC void +xfs_qm_qoff_logitem_release( + struct xfs_log_item *lip) +{ + struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); + + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { + if (qoff->qql_start_lip) + xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); + xfs_qm_qoff_logitem_relse(qoff); + } +} + static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; /* + * Delete the quotaoff intent from the AIL and free it. On success, + * this should only be called for the start item. It can be used for + * either on shutdown or abort. + */ +void +xfs_qm_qoff_logitem_relse( + struct xfs_qoff_logitem *qoff) +{ + struct xfs_log_item *lip = &qoff->qql_item; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || + test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + kmem_free(lip->li_lv_shadow); + kmem_free(qoff); +} + +/* * Allocate and initialize an quotaoff item of the correct quota type(s). 
*/ struct xfs_qoff_logitem * diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 1aed34ccdabc..2b86a43d7ce2 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -11,25 +11,28 @@ struct xfs_trans; struct xfs_mount; struct xfs_qoff_logitem; -typedef struct xfs_dq_logitem { - struct xfs_log_item qli_item; /* common portion */ - struct xfs_dquot *qli_dquot; /* dquot ptr */ - xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ -} xfs_dq_logitem_t; +struct xfs_dq_logitem { + struct xfs_log_item qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ +}; -typedef struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ +struct xfs_qoff_logitem { + struct xfs_log_item qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ unsigned int qql_flags; -} xfs_qoff_logitem_t; +}; -extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); -extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, - struct xfs_qoff_logitem *, uint); -extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *, uint); -extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *); +void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); +struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags); +void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); +struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, + uint flags); +void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp); #endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 849fd4476950..a21e9cc6516a 100644 --- a/fs/xfs/xfs_error.c 
+++ b/fs/xfs/xfs_error.c @@ -257,7 +257,7 @@ xfs_errortag_test( xfs_warn_ratelimited(mp, "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_fsname); + expression, file, line, mp->m_super->s_id); return true; } @@ -329,19 +329,43 @@ xfs_corruption_error( const char *tag, int level, struct xfs_mount *mp, - void *buf, + const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr) { - if (level <= xfs_error_level) + if (buf && level <= xfs_error_level) xfs_hex_dump(buf, bufsize); xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } /* + * Complain about the kinds of metadata corruption that we can't detect from a + * verifier, such as incorrect inter-block relationship data. Does not set + * bp->b_error. + * + * Call xfs_buf_mark_corrupt, not this function. + */ +void +xfs_buf_corruption_error( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + struct xfs_mount *mp = bp->b_mount; + + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, %s block 0x%llx", + fa, bp->b_ops->name, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} + +/* * Warnings specifically for verifier errors. Differentiate CRC vs. invalid * values, and omit the stack trace unless the error level is tuned high. 
*/ @@ -350,7 +374,7 @@ xfs_buf_verifier_error( struct xfs_buf *bp, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { @@ -402,7 +426,7 @@ xfs_inode_verifier_error( struct xfs_inode *ip, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 602aa7d62b66..1717b7508356 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -12,16 +12,17 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, const char *filename, int linenum, xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, void *buf, size_t bufsize, + struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); extern void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ @@ -37,32 +38,6 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, /* Dump 128 bytes of any corrupt buffer */ #define XFS_CORRUPTION_DUMP_LEN (128) -/* - * Macros to set EFSCORRUPTED & return/branch. 
- */ -#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ - { \ - int fs_is_ok = (x); \ - ASSERT(fs_is_ok); \ - if (unlikely(!fs_is_ok)) { \ - XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, mp); \ - error = -EFSCORRUPTED; \ - goto l; \ - } \ - } - -#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ - { \ - int fs_is_ok = (x); \ - ASSERT(fs_is_ok); \ - if (unlikely(!fs_is_ok)) { \ - XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, mp); \ - return -EFSCORRUPTED; \ - } \ - } - #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 2183d87be4cf..3991e59cfd18 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -367,7 +367,7 @@ restart: * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. 
*/ - if (!xfs_alloc_is_userdata(args->datatype) && + if (!(args->datatype & XFS_ALLOC_USERDATA) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e44efc41a041..6ea847f6e298 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -21,7 +21,7 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" - +#include "xfs_error.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; @@ -39,7 +39,7 @@ xfs_efi_item_free( if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else - kmem_zone_free(xfs_efi_zone, efip); + kmem_cache_free(xfs_efi_zone, efip); } /* @@ -228,6 +228,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -243,7 +244,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else - kmem_zone_free(xfs_efd_zone, efdp); + kmem_cache_free(xfs_efd_zone, efdp); } /* @@ -624,7 +625,7 @@ xfs_efi_recover( */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); - return -EIO; + return -EFSCORRUPTED; } } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1ffb179f35d2..4b8bdecc3863 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -187,8 +177,14 @@ xfs_file_dio_aio_read( 
file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, + is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -215,7 +211,7 @@ xfs_file_dax_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); } - ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -351,7 +347,7 @@ restart: trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, - NULL, &xfs_iomap_ops); + NULL, &xfs_buffered_write_iomap_ops); if (error) return error; } else @@ -486,8 +482,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) @@ -547,15 +542,13 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); - /* - * If unaligned, this is the only IO in-flight. If it has not yet - * completed, wait on it before we release the iolock to prevent - * subsequent overlapping IO. + * If unaligned, this is the only IO in-flight. Wait on it before we + * release the iolock to prevent subsequent overlapping IO. 
*/ - if (ret == -EIOCBQUEUED && unaligned_io) - inode_dio_wait(inode); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, + &xfs_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io); out: xfs_iunlock(ip, iolock); @@ -594,7 +587,7 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -641,7 +634,8 @@ write_retry: current->backing_dev_info = inode_to_bdi(inode); trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); - ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -818,6 +812,36 @@ xfs_file_fallocate( if (error) goto out_unlock; + /* + * Must wait for all AIO to complete before we continue as AIO can + * change the file size on completion without holding any locks we + * currently hold. We must do this first because AIO can update both + * the on disk and in memory inode sizes, and the operations that follow + * require the in-memory size to be fully up-to-date. + */ + inode_dio_wait(inode); + + /* + * Now AIO and DIO has drained we flush and (if necessary) invalidate + * the cached range over the first operation we are about to run. + * + * We care about zero and collapse here because they both run a hole + * punch over the range first. Because that can zero data, and the range + * of invalidation for the shift operations is much larger, we still do + * the required flush for collapse in xfs_prepare_shift(). + * + * Insert has the same range requirements as collapse, and we extend the + * file first which can zero data. 
Hence insert has the same + * flush/invalidate requirements as collapse and so they are both + * handled at the right time by xfs_prepare_shift(). + */ + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE)) { + error = xfs_flush_unmap_range(ip, offset, len); + if (error) + goto out_unlock; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -881,16 +905,30 @@ xfs_file_fallocate( } if (mode & FALLOC_FL_ZERO_RANGE) { - error = xfs_zero_file_space(ip, offset, len); + /* + * Punch a hole and prealloc the range. We use a hole + * punch rather than unwritten extent conversion for two + * reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is + * still zero-valued by virtue of the hole punch. + */ + unsigned int blksize = i_blocksize(inode); + + trace_xfs_zero_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); } else if (mode & FALLOC_FL_UNSHARE_RANGE) { error = xfs_reflink_unshare(ip, offset, len); if (error) goto out_unlock; - - if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); - } } else { /* * If always_cow mode we can't use preallocations and @@ -900,12 +938,14 @@ xfs_file_fallocate( error = -EOPNOTSUPP; goto out_unlock; } + } + if (!xfs_is_always_cow_inode(ip)) { error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + if (error) + goto out_unlock; } - if (error) - goto out_unlock; } if (file->f_flags & O_DSYNC) @@ -1019,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) 
@@ -1059,7 +1103,7 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - error = xfs_dir3_data_readahead(ip, 0, -1); + error = xfs_dir3_data_readahead(ip, 0, 0); xfs_iunlock(ip, mode); return error; } @@ -1156,12 +1200,16 @@ __xfs_filemap_fault( if (IS_DAX(inode)) { pfn_t pfn; - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, + (write_fault && !vmf->cow_page) ? + &xfs_direct_write_iomap_ops : + &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + ret = iomap_page_mkwrite(vmf, + &xfs_buffered_write_iomap_ops); else ret = filemap_fault(vmf); } @@ -1225,22 +1273,22 @@ static const struct vm_operations_struct xfs_file_vm_ops = { STATIC int xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) + struct file *file, + struct vm_area_struct *vma) { - struct dax_device *dax_dev; + struct inode *inode = file_inode(file); + struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); - dax_dev = xfs_find_daxdev_for_inode(file_inode(filp)); /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. 
*/ - if (!daxdev_mapping_supported(vma, dax_dev)) + if (!daxdev_mapping_supported(vma, target->bt_daxdev)) return -EOPNOTSUPP; - file_accessed(filp); + file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; - if (IS_DAX(file_inode(filp))) + if (IS_DAX(inode)) vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 574a7a8b4736..1a88025e68a3 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -18,6 +18,7 @@ #include "xfs_trace.h" #include "xfs_ag_resv.h" #include "xfs_trans.h" +#include "xfs_filestream.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -158,16 +159,15 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); - if (err && !trylock) { + if (err) { xfs_perag_put(pag); - return err; + if (err != -EAGAIN) + return err; + /* Couldn't lock the AGF, skip this AG. */ + continue; } } - /* Might fail sometimes during the 1st pass with trylock set. */ - if (!pag->pagf_init) - goto next_ag; - /* Keep track of the AG with the most free blocks. 
*/ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; @@ -374,7 +374,7 @@ xfs_filestream_new_ag( startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - if (xfs_alloc_is_userdata(ap->datatype)) + if (ap->datatype & XFS_ALLOC_USERDATA) flags |= XFS_PICK_USERDATA; if (ap->tp->t_flags & XFS_TRANS_LOWMODE) flags |= XFS_PICK_LOWSPACE; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d082143feb5a..4eebcec4aae6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -146,6 +146,7 @@ xfs_fsmap_owner_from_rmap( dest->fmr_owner = XFS_FMR_OWN_FREE; break; default: + ASSERT(0); return -EFSCORRUPTED; } return 0; @@ -343,7 +344,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -361,7 +362,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; @@ -895,6 +896,14 @@ xfs_getfsmap( info.format_arg = arg; info.head = head; + /* + * If fsmap runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the rmapbt and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while we're reading buffers. + */ + sb_start_write(mp->m_super); + /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? 
*/ @@ -934,6 +943,7 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); + sb_end_write(mp->m_super); head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 944add5ff8e0..8bf1d15be3f6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -44,7 +44,7 @@ xfs_inode_alloc( if (!ip) return NULL; if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_zone, ip); return NULL; } @@ -104,7 +104,7 @@ xfs_inode_free_callback( ip->i_itemp = NULL; } - kmem_zone_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_zone, ip); } static void @@ -289,6 +289,8 @@ xfs_reinit_inode( uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -297,6 +299,8 @@ xfs_reinit_inode( inode_set_iversion_queried(inode, version); inode->i_mode = mode; inode->i_rdev = dev; + inode->i_uid = uid; + inode->i_gid = gid; return error; } @@ -907,7 +911,12 @@ xfs_eofblocks_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_eofblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; xfs_icache_free_eofblocks(mp, NULL); + sb_end_write(mp->m_super); + xfs_queue_eofblocks(mp); } @@ -934,7 +943,12 @@ xfs_cowblocks_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_cowblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; xfs_icache_free_cowblocks(mp, NULL); + sb_end_write(mp->m_super); + xfs_queue_cowblocks(mp); } @@ -1419,7 +1433,7 @@ xfs_inode_match_id( return 0; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) != eofb->eof_prid) + ip->i_d.di_projid != eofb->eof_prid) return 0; return 1; @@ -1443,7 +1457,7 @@ xfs_inode_match_id_union( return 1; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) == 
eofb->eof_prid) + ip->i_d.di_projid == eofb->eof_prid) return 1; return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 3ebd1b7f49d8..490fee22b878 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -55,7 +55,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { - kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip)); + kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip)); } static const struct xfs_item_ops xfs_icreate_item_ops = { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 18f4b262e61c..d1772786af29 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -55,6 +55,12 @@ xfs_extlen_t xfs_get_extsz_hint( struct xfs_inode *ip) { + /* + * No point in aligning allocations if we need to COW to actually + * write to them. + */ + if (xfs_is_always_cow_inode(ip)) + return 0; if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) return ip->i_d.di_extsize; if (XFS_IS_REALTIME_INODE(ip)) @@ -795,26 +801,18 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_uid = current_fsuid(); inode->i_rdev = rdev; - xfs_set_projid(ip, prid); + ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; + inode->i_gid = VFS_I(pip)->i_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current_fsgid(); } /* @@ -822,9 +820,8 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). 
*/ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -841,15 +838,13 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; + ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -902,20 +897,13 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: @@ -1117,7 +1105,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1153,8 +1140,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1214,8 +1200,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? 
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1304,8 +1289,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1418,7 +1402,7 @@ xfs_link( * the tree quota mechanism could be circumvented. */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + tdp->i_d.di_projid != sip->i_d.di_projid)) { error = -EXDEV; goto error_return; } @@ -1513,10 +1497,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1536,21 +1518,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. 
*/ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1570,7 +1553,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; @@ -2115,7 +2098,7 @@ xfs_iunlink_update_bucket( unsigned int bucket_index, xfs_agino_t new_agino) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; @@ -2130,8 +2113,10 @@ xfs_iunlink_update_bucket( * passed in because either we're adding or removing ourselves from the * head of the list. */ - if (old_value == new_agino) + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; + } agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); offset = offsetof(struct xfs_agi, agi_unlinked) + @@ -2194,6 +2179,8 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. 
*/ old_value = be32_to_cpu(dip->di_next_unlinked); if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } @@ -2205,8 +2192,11 @@ xfs_iunlink_update_inode( */ *old_next_agino = old_value; if (old_value == next_agino) { - if (next_agino != NULLAGINO) + if (next_agino != NULLAGINO) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; + } goto out; } @@ -2248,7 +2238,7 @@ xfs_iunlink( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2257,8 +2247,10 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, agno, next_agino)) + !xfs_verify_agino_or_null(mp, agno, next_agino)) { + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; + } if (next_agino != NULLAGINO) { struct xfs_perag *pag; @@ -2430,7 +2422,7 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2511,6 +2503,88 @@ out: } /* + * Look up the inode number specified and mark it stale if it is found. If it is + * dirty, return the inode so it can be attached to the cluster buffer so it can + * be processed appropriately when the cluster free transaction completes. 
+ */ +static struct xfs_inode * +xfs_ifree_get_one_inode( + struct xfs_perag *pag, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) + goto out_rcu_unlock; + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + goto out_rcu_unlock; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're racing with + * freeing in xfs_reclaim_inode(). See the comments in that + * function for more information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_rcu_unlock; + } + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * We don't need to attach clean inodes or those only with unlogged + * changes (which we throw away, anyway). 
+ */ + if (!ip->i_itemp || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_no_inode; + } + return ip; + +out_rcu_unlock: + rcu_read_unlock(); +out_no_inode: + return NULL; +} + +/* * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. @@ -2533,6 +2607,7 @@ xfs_ifree_cluster( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_ino_t inum; + int error; inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); @@ -2561,12 +2636,11 @@ xfs_ifree_cluster( * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED); - - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * igeo->blocks_per_cluster, + XBF_UNMAPPED, &bp); + if (error) + return error; /* * This buffer may not have been correctly initialised as we @@ -2610,77 +2684,11 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); - continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. 
- */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); + ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); + if (!ip) continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3196,6 +3204,7 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + struct xfs_buf *agibp; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -3270,7 +3279,7 @@ xfs_rename( * tree quota mechanism would be circumvented. 
*/ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { error = -EXDEV; goto out_trans_cancel; } @@ -3327,7 +3336,6 @@ xfs_rename( goto out_trans_cancel; xfs_bumplink(tp, wip); - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); VFS_I(wip)->i_state &= ~I_LINKABLE; } @@ -3361,6 +3369,22 @@ xfs_rename( * In case there is already an entry with the same * name at the destination directory, remove it first. */ + + /* + * Check whether the replace operation will need to allocate + * blocks. This happens when the shortform directory lacks + * space and we have to convert it to a block format directory. + * When more blocks are necessary, we must lock the AGI first + * to preserve locking order (AGI -> AGF). + */ + if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) { + error = xfs_read_agi(mp, tp, + XFS_INO_TO_AGNO(mp, target_ip->i_ino), + &agibp); + if (error) + goto out_trans_cancel; + } + error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) @@ -3778,7 +3802,6 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3839,7 +3862,7 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; /* Check the inline fork data before we write out. */ @@ -3922,3 +3945,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. 
+ */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 558173f95a03..c6a63f6764a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -37,9 +37,6 @@ typedef struct xfs_inode { struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ - /* operations vectors */ - const struct xfs_dir_ops *d_ops; /* directory ops vector */ - /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ @@ -177,30 +174,11 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) return ret; } -/* - * Project quota id helpers (previously projid was 16bit only - * and using two 16bit values to hold new 32bit projid was chosen - * to retain compatibility with "old" filesystems). - */ -static inline prid_t -xfs_get_projid(struct xfs_inode *ip) -{ - return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; -} - -static inline void -xfs_set_projid(struct xfs_inode *ip, - prid_t projid) -{ - ip->i_d.di_projid_hi = (uint16_t) (projid >> 16); - ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff); -} - static inline prid_t xfs_get_initial_prid(struct xfs_inode *dp) { if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - return xfs_get_projid(dp); + return dp->i_d.di_projid; return XFS_PROJID_DEFAULT; } @@ -220,6 +198,13 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) } /* + * Return the buftarg used for data allocations on a given inode. + */ +#define xfs_inode_buftarg(ip) \ + (XFS_IS_REALTIME_INODE(ip) ? \ + (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) + +/* * In-core inode flags. 
*/ #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ @@ -441,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bb8f076805b9..f779cca2346f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_error.h" #include <linux/iversion.h> @@ -124,7 +125,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_log_dinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -304,13 +305,11 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; - to->di_projid_lo = from->di_projid_lo; - to->di_projid_hi = from->di_projid_hi; + to->di_uid = i_uid_read(inode); + to->di_gid = i_gid_read(inode); + to->di_projid_lo = from->di_projid & 0xffff; + to->di_projid_hi = from->di_projid >> 16; memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); @@ -338,10 +337,11 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.t_sec; - to->di_crtime.t_nsec = from->di_crtime.t_nsec; + to->di_crtime.t_sec = from->di_crtime.tv_sec; + to->di_crtime.t_nsec 
= from->di_crtime.tv_nsec; to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -350,6 +350,7 @@ xfs_inode_to_log_dinode( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = from->di_flushiter; } } @@ -369,7 +370,7 @@ xfs_inode_item_format_core( dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); - xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount)); } /* @@ -394,8 +395,6 @@ xfs_inode_item_format( struct xfs_log_iovec *vecp = NULL; struct xfs_inode_log_format *ilf; - ASSERT(ip->i_d.di_version > 1); - ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -553,7 +552,8 @@ xfs_inode_item_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -666,7 +666,7 @@ xfs_inode_item_destroy( xfs_inode_t *ip) { kmem_free(ip->i_itemp->ili_item.li_lv_shadow); - kmem_zone_free(xfs_ili_zone, ip->i_itemp); + kmem_cache_free(xfs_ili_zone, ip->i_itemp); } @@ -731,29 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. 
*/ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, tail_lsn); } /* @@ -828,8 +826,10 @@ xfs_inode_item_format_convert( { struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; - if (buf->i_len != sizeof(*in_f32)) + if (buf->i_len != sizeof(*in_f32)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; + } in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d58f0d6a699e..309958186d33 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -33,6 +33,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_reflink.h" +#include "xfs_ioctl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/mount.h> #include <linux/namei.h> @@ -290,138 +294,173 @@ xfs_readlink_by_handle( return error; } -int -xfs_set_dmattrs( - xfs_inode_t *ip, - uint evmask, - uint16_t state) +/* + * Format an attribute and copy it out to the user's buffer. 
+ * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) { - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - int error; + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - ip->i_d.di_dmevmask = evmask; - ip->i_d.di_dmstate = state; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp); + /* + * Only list entries in the right namespace. 
+ */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} - return error; +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; } -STATIC int -xfs_fssetdm_by_handle( - struct file *parfilp, - void __user *arg) +static unsigned int +xfs_attr_flags( + u32 ioc_flags) { - int error; - struct fsdmidata fsd; - xfs_fsop_setdm_handlereq_t dmhreq; - struct dentry *dentry; + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XATTR_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XATTR_REPLACE; + return 0; +} - if (!capable(CAP_MKNOD)) - return -EPERM; - if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) - return -EFAULT; +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + int bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; - error = mnt_want_write_file(parfilp); - if (error) - return error; + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) + return -EINVAL; - dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) { - mnt_drop_write_file(parfilp); - 
return PTR_ERR(dentry); - } + /* + * Reject flags, only allow namespaces. + */ + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; - if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { - error = -EPERM; - goto out; - } + /* + * Validate the cursor. + */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; - if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { - error = -EFAULT; - goto out; - } + buffer = kmem_zalloc_large(bufsize, 0); + if (!buffer) + return -ENOMEM; - error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, - fsd.fsd_dmstate); + /* + * Initialize the output buffer. + */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); + if (error) + goto out_free; - out: - mnt_drop_write_file(parfilp); - dput(dentry); + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) + error = -EFAULT; +out_free: + kmem_free(buffer); return error; } STATIC int xfs_attrlist_by_handle( struct file *parfilp, - void __user *arg) + struct xfs_fsop_attrlist_handlereq __user *p) { - int error = -ENOMEM; - attrlist_cursor_kern_t *cursor; - struct xfs_fsop_attrlist_handlereq __user *p = arg; - xfs_fsop_attrlist_handlereq_t al_hreq; + struct xfs_fsop_attrlist_handlereq al_hreq; struct dentry *dentry; - char *kbuf; + int 
error = -ENOMEM; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. - */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -EINVAL; dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); - if (error) - goto out_kfree; - - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { - error = -EFAULT; - goto out_kfree; - } - - if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) - error = -EFAULT; - -out_kfree: - kmem_free(kbuf); -out_dput: + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); dput(dentry); return error; } -int +static int xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, @@ -429,28 +468,33 @@ xfs_attrmulti_attr_get( uint32_t *len, uint32_t flags) { - unsigned char *kbuf; - int error = -EFAULT; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, 0); - if (!kbuf) - return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); + error = xfs_attr_get(&args); if (error) goto out_kfree; - if (copy_to_user(ubuf, kbuf, *len)) + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) error = -EFAULT; out_kfree: - kmem_free(kbuf); + 
kmem_free(args.value); return error; } -int +static int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, @@ -458,38 +502,75 @@ xfs_attrmulti_attr_set( uint32_t len, uint32_t flags) { - unsigned char *kbuf; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + }; int error; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - kbuf = memdup_user(ubuf, len); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); - if (!error) - xfs_forget_acl(inode, name, flags); - kfree(kbuf); + error = xfs_attr_set(&args); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); return error; } int -xfs_attrmulti_attr_remove( +xfs_ioc_attrmulti_one( + struct file *parfilp, struct inode *inode, - unsigned char *name, + uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, uint32_t flags) { + unsigned char *name; int error; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - error = xfs_attr_remove(XFS_I(inode), name, flags); - if (!error) - xfs_forget_acl(inode, name, flags); + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + /* fall through */ + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + 
mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); return error; } @@ -503,7 +584,6 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -529,56 +609,17 @@ xfs_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user((char *)attr_name, - ops[i].am_attrname, MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, &ops[i].am_length, - ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, ops[i].am_length, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(am_hreq.ops, ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); @@ -588,13 +629,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( struct file *filp, - unsigned int cmd, xfs_flock64_t *bf) { struct inode *inode = file_inode(filp); struct xfs_inode 
*ip = XFS_I(inode); struct iattr iattr; - enum xfs_prealloc_flags flags = 0; + enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; @@ -607,6 +647,9 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (xfs_is_always_cow_inode(ip)) + return -EOPNOTSUPP; + if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; if (filp->f_mode & FMODE_NOCMTIME) @@ -620,6 +663,7 @@ xfs_ioc_space( error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; + inode_dio_wait(inode); switch (bf->l_whence) { case 0: /*SEEK_SET*/ @@ -635,73 +679,21 @@ xfs_ioc_space( goto out_unlock; } - /* - * length of <= 0 for resv/unresv/zero is invalid. length for - * alloc/free is ignored completely and we have no idea what userspace - * might have set it to, so set it to zero to allow range - * checks to pass. - */ - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if (bf->l_len <= 0) { - error = -EINVAL; - goto out_unlock; - } - break; - default: - bf->l_len = 0; - break; - } - - if (bf->l_start < 0 || - bf->l_start > inode->i_sb->s_maxbytes || - bf->l_start + bf->l_len < 0 || - bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) { + if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { error = -EINVAL; goto out_unlock; } - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - flags |= XFS_PREALLOC_SET; - error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); - break; - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - flags |= XFS_PREALLOC_SET; - error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, - XFS_BMAPI_PREALLOC); - break; - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - error = xfs_free_file_space(ip, bf->l_start, bf->l_len); - break; - case XFS_IOC_ALLOCSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP: - case XFS_IOC_FREESP64: - flags |= XFS_PREALLOC_CLEAR; - if (bf->l_start > 
XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - - error = xfs_vn_setattr_size(file_dentry(filp), &iattr); - break; - default: - ASSERT(0); - error = -EINVAL; + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; } + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + error = xfs_vn_setattr_size(file_dentry(filp), &iattr); if (error) goto out_unlock; @@ -1116,7 +1108,7 @@ xfs_fill_fsxattr( fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; - fa->fsx_projid = xfs_get_projid(ip); + fa->fsx_projid = ip->i_d.di_projid; if (attr) { if (ip->i_afp) { @@ -1271,7 +1263,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && ip->i_d.di_version < 3) + if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1311,10 +1303,9 @@ xfs_ioctl_setattr_dax_invalidate( * have to check the device for dax support or flush pagecache. 
*/ if (fa->fsx_xflags & FS_XFLAG_DAX) { - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; - if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), - sb->s_blocksize)) + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize)) return -EINVAL; } @@ -1482,8 +1473,7 @@ xfs_ioctl_setattr_check_cowextsize( if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || - ip->i_d.di_version != 3) + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) return -EINVAL; if (fa->fsx_cowextsize == 0) @@ -1544,9 +1534,9 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, - ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + VFS_I(ip)->i_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } @@ -1569,7 +1559,7 @@ xfs_ioctl_setattr( } if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - xfs_get_projid(ip) != fa->fsx_projid) { + ip->i_d.di_projid != fa->fsx_projid) { code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, capable(CAP_FOWNER) ? 
XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ @@ -1606,13 +1596,12 @@ xfs_ioctl_setattr( VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ - if (xfs_get_projid(ip) != fa->fsx_projid) { + if (ip->i_d.di_projid != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ASSERT(ip->i_d.di_version > 1); - xfs_set_projid(ip, fa->fsx_projid); + ip->i_d.di_projid = fa->fsx_projid; } /* @@ -1624,7 +1613,7 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; - if (ip->i_d.di_version == 3 && + if (xfs_sb_version_has_v3inode(&mp->m_sb) && (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) ip->i_d.di_cowextsize = fa->fsx_cowextsize >> mp->m_sb.sb_blocklog; @@ -2122,24 +2111,17 @@ xfs_file_ioctl( return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - case XFS_IOC_ZERO_RANGE: { + case XFS_IOC_FREESP64: { xfs_flock64_t bf; if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(filp, cmd, &bf); + return xfs_ioc_space(filp, &bf); } case XFS_IOC_DIOINFO: { - struct dioattr da; - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct dioattr da; da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); @@ -2183,22 +2165,6 @@ xfs_file_ioctl( case XFS_IOC_SETXFLAGS: return xfs_ioc_setxflags(ip, filp, arg); - case XFS_IOC_FSSETDM: { - struct fsdmidata dmi; - - if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, - dmi.fsd_dmstate); - mnt_drop_write_file(filp); - return error; - } - case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: @@ -2226,8 +2192,6 @@ xfs_file_ioctl( return -EFAULT; return xfs_open_by_handle(filp, &hreq); } - case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(filp, arg); case XFS_IOC_READLINK_BY_HANDLE: { xfs_fsop_handlereq_t hreq; @@ -2399,7 +2363,10 @@ xfs_file_ioctl( if (error) return error; - return xfs_icache_free_eofblocks(mp, &keofb); + sb_start_write(mp->m_super); + error = xfs_icache_free_eofblocks(mp, &keofb); + sb_end_write(mp->m_super); + return error; } default: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 654c0bb1bcf8..bab6a5a92407 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -6,10 +6,14 @@ #ifndef __XFS_IOCTL_H__ #define __XFS_IOCTL_H__ +struct xfs_bstat; +struct xfs_ibulk; +struct xfs_inogrp; + + extern int xfs_ioc_space( struct file *filp, - unsigned int cmd, xfs_flock64_t *bf); int @@ -31,27 +35,11 @@ xfs_readlink_by_handle( struct file *parfilp, xfs_fsop_handlereq_t *hreq); -extern int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t *len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_remove( - struct inode 
*inode, - unsigned char *name, - uint32_t flags); +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, + int flags, struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( @@ -71,16 +59,6 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); -extern int -xfs_set_dmattrs( - struct xfs_inode *ip, - uint evmask, - uint16_t state); - -struct xfs_ibulk; -struct xfs_bstat; -struct xfs_inogrp; - int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1e08bf79b478..c1771e728117 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin( xfs_bstime_t *bstime, compat_xfs_bstime_t __user *bstime32) { - compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */ if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) @@ -352,56 +352,24 @@ xfs_compat_handlereq_to_dentry( STATIC int xfs_compat_attrlist_by_handle( struct file *parfilp, - void __user *arg) + compat_xfs_fsop_attrlist_handlereq_t __user *p) { - int error; - attrlist_cursor_kern_t *cursor; - compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; - char *kbuf; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&al_hreq, arg, - sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. 
- */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -EINVAL; dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); - if (error) - goto out_kfree; - - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { - error = -EFAULT; - goto out_kfree; - } - - if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) - error = -EFAULT; - -out_kfree: - kmem_free(kbuf); -out_dput: + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), + compat_ptr(al_hreq.buffer), al_hreq.buflen, + al_hreq.flags, &p->pos); dput(dentry); return error; } @@ -416,7 +384,6 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -443,101 +410,24 @@ xfs_compat_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, compat_ptr(ops[i].am_attrname), - MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - &ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), 
attr_name, - compat_ptr(ops[i].am_attrvalue), - ops[i].am_length, ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); return error; } -STATIC int -xfs_compat_fssetdm_by_handle( - struct file *parfilp, - void __user *arg) -{ - int error; - struct fsdmidata fsd; - compat_xfs_fsop_setdm_handlereq_t dmhreq; - struct dentry *dentry; - - if (!capable(CAP_MKNOD)) - return -EPERM; - if (copy_from_user(&dmhreq, arg, - sizeof(compat_xfs_fsop_setdm_handlereq_t))) - return -EFAULT; - - dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { - error = -EPERM; - goto out; - } - - if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { - error = -EFAULT; - goto out; - } - - error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, - fsd.fsd_dmstate); - -out: - dput(dentry); - return error; -} - long xfs_file_compat_ioctl( struct file *filp, @@ -557,18 +447,13 @@ xfs_file_compat_ioctl( case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: - case XFS_IOC_RESVSP_32: - case XFS_IOC_UNRESVSP_32: - case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: - case XFS_IOC_ZERO_RANGE_32: { + case XFS_IOC_FREESP64_32: { struct xfs_flock64 bf; if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, cmd, &bf); + return 
xfs_ioc_space(filp, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); @@ -651,8 +536,6 @@ xfs_file_compat_ioctl( return xfs_compat_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE_32: return xfs_compat_attrmulti_by_handle(filp, arg); - case XFS_IOC_FSSETDM_BY_HANDLE_32: - return xfs_compat_fssetdm_by_handle(filp, arg); default: /* try the native version */ return xfs_file_ioctl(filp, cmd, (unsigned long)arg); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 7985344d3aa6..053de7d894cd 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -32,7 +32,7 @@ #endif typedef struct compat_xfs_bstime { - compat_time_t tv_sec; /* seconds */ + old_time32_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; @@ -99,7 +99,7 @@ typedef struct compat_xfs_fsop_handlereq { _IOWR('X', 108, struct compat_xfs_fsop_handlereq) /* The bstat field in the swapext struct needs translation */ -typedef struct compat_xfs_swapext { +struct compat_xfs_swapext { int64_t sx_version; /* version */ int64_t sx_fdtarget; /* fd of target file */ int64_t sx_fdtmp; /* fd of tmp file */ @@ -107,7 +107,7 @@ typedef struct compat_xfs_swapext { xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */ -} __compat_packed compat_xfs_swapext_t; +} __compat_packed; #define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) @@ -143,15 +143,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { #define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) -typedef struct compat_xfs_fsop_setdm_handlereq { - struct compat_xfs_fsop_handlereq hreq; /* handle information */ - /* ptr to struct fsdmidata */ - compat_uptr_t data; /* DMAPI data */ -} compat_xfs_fsop_setdm_handlereq_t; - -#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ - _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) - 
#ifdef BROKEN_X86_ALIGNMENT /* on ia32 l_start is on a 32-bit boundary */ typedef struct compat_xfs_flock64 { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f780e223b118..bb590a267a7f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -29,8 +29,8 @@ #include "xfs_reflink.h" -#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ - << mp->m_writeio_log) +#define XFS_ALLOC_ALIGN(mp, off) \ + (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) static int xfs_alert_fsblock_zero( @@ -54,9 +54,10 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - bool shared) + u16 flags) { struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); @@ -77,14 +78,13 @@ xfs_bmbt_to_iomap( } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); - iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->bdev = target->bt_bdev; + iomap->dax_dev = target->bt_daxdev; + iomap->flags = flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; - if (shared) - iomap->flags |= IOMAP_F_SHARED; return 0; } @@ -95,18 +95,30 @@ xfs_hole_to_iomap( xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); - iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->bdev = target->bt_bdev; + iomap->dax_dev = target->bt_daxdev; +} + +static inline xfs_fileoff_t +xfs_iomap_end_fsb( + struct xfs_mount *mp, + loff_t offset, + loff_t count) +{ + 
ASSERT(offset <= mp->m_super->s_maxbytes); + return min(XFS_B_TO_FSB(mp, offset + count), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); } -xfs_extlen_t +static xfs_extlen_t xfs_eof_alignment( - struct xfs_inode *ip, - xfs_extlen_t extsize) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t align = 0; @@ -129,111 +141,80 @@ xfs_eof_alignment( align = 0; } - /* - * Always round up the allocation request to an extent boundary - * (when file on a real-time subvolume or has di_extsize hint). - */ - if (extsize) { - if (align) - align = roundup_64(align, extsize); - else - align = extsize; - } - return align; } -STATIC int +/* + * Check if last_fsb is outside the last extent, and if so grow it to the next + * stripe unit boundary. + */ +xfs_fileoff_t xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, - xfs_extlen_t extsize, - xfs_fileoff_t *last_fsb) + xfs_fileoff_t end_fsb) { - xfs_extlen_t align = xfs_eof_alignment(ip, extsize); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + xfs_extlen_t align = xfs_eof_alignment(ip); + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + + /* + * Always round up the allocation request to the extent hint boundary. 
+ */ + if (extsz) { + if (align) + align = roundup_64(align, extsz); + else + align = extsz; + } if (align) { - xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); - int eof, error; + xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) - return error; - if (eof) - *last_fsb = new_last_fsb; + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, &irec) || + aligned_end_fsb >= irec.br_startoff + irec.br_blockcount) + return aligned_end_fsb; } - return 0; + + return end_fsb; } int xfs_iomap_write_direct( - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *imap, - int nmaps) + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, + struct xfs_bmbt_irec *imap) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t count_fsb, resaligned; - xfs_extlen_t extsz; - int nimaps; - int quota_flag; - int rt; - xfs_trans_t *tp; - uint qblocks, resblks, resrtextents; - int error; - int lockmode; - int bmapi_flags = XFS_BMAPI_PREALLOC; - uint tflags = 0; - - rt = XFS_IS_REALTIME_INODE(ip); - extsz = xfs_get_extsz_hint(ip); - lockmode = XFS_ILOCK_SHARED; /* locked by caller */ - - ASSERT(xfs_isilocked(ip, lockmode)); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_filblks_t resaligned; + int nimaps; + int quota_flag; + uint qblocks, resblks; + unsigned int resrtextents = 0; + int error; + int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > XFS_ISIZE(ip)) { - /* - * Assert that the in-core extent list is present since this can - * call xfs_iread_extents() and we only have the ilock shared. - * This should be safe because the lock was held around a bmapi - * call in the caller and we only need it to access the in-core - * list. 
- */ - ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & - XFS_IFEXTENTS); - error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb); - if (error) - goto out_unlock; - } else { - if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) - last_fsb = min(last_fsb, (xfs_fileoff_t) - imap->br_blockcount + - imap->br_startoff); - } - count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); - resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz); - if (unlikely(rt)) { + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + xfs_get_extsz_hint(ip)); + if (unlikely(XFS_IS_REALTIME_INODE(ip))) { resrtextents = qblocks = resaligned; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { - resrtextents = 0; resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); quota_flag = XFS_QMOPT_RES_REGBLKS; } - /* - * Drop the shared lock acquired by the caller, attach the dquot if - * necessary and move on to transaction setup. - */ - xfs_iunlock(ip, lockmode); error = xfs_qm_dqattach(ip); if (error) return error; @@ -263,8 +244,7 @@ xfs_iomap_write_direct( if (error) return error; - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) @@ -277,8 +257,8 @@ xfs_iomap_write_direct( * caller gave to us. 
*/ nimaps = 1; - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - bmapi_flags, resblks, imap, &nimaps); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, + imap, &nimaps); if (error) goto out_res_cancel; @@ -301,7 +281,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: - xfs_iunlock(ip, lockmode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_res_cancel: @@ -410,19 +390,19 @@ xfs_iomap_prealloc_size( if (offset + count <= XFS_ISIZE(ip)) return 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && - (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) + if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))) return 0; /* * If an explicit allocsize is set, the file is small, or we * are writing behind a hole, then use the minimum prealloc: */ - if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || + if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || !xfs_iext_peek_prev_extent(ifp, icur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) - return mp->m_writeio_blocks; + return mp->m_allocsize_blocks; /* * Determine the initial size of the preallocation. 
We are beyond the @@ -515,219 +495,13 @@ xfs_iomap_prealloc_size( while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; check_writeio: - if (alloc_blocks < mp->m_writeio_blocks) - alloc_blocks = mp->m_writeio_blocks; + if (alloc_blocks < mp->m_allocsize_blocks) + alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, - mp->m_writeio_blocks); + mp->m_allocsize_blocks); return alloc_blocks; } -static int -xfs_file_iomap_begin_delay( - struct inode *inode, - loff_t offset, - loff_t count, - unsigned flags, - struct iomap *iomap) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t maxbytes_fsb = - XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb; - struct xfs_bmbt_irec imap, cmap; - struct xfs_iext_cursor icur, ccur; - xfs_fsblock_t prealloc_blocks = 0; - bool eof = false, cow_eof = false, shared = false; - int whichfork = XFS_DATA_FORK; - int error = 0; - - ASSERT(!XFS_IS_REALTIME_INODE(ip)); - ASSERT(!xfs_get_extsz_hint(ip)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto out_unlock; - } - - XFS_STATS_INC(mp, xs_blk_mapw); - - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } - - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - - /* - * Search the data fork fork first to look up our source mapping. We - * always need the data fork map, as we have to return it to the - * iomap code so that the higher level write code can read data in to - * perform read-modify-write cycles for unaligned writes. 
- */ - eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); - if (eof) - imap.br_startoff = end_fsb; /* fake hole until the end */ - - /* We never need to allocate blocks for zeroing a hole. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); - goto out_unlock; - } - - /* - * Search the COW fork extent list even if we did not find a data fork - * extent. This serves two purposes: first this implements the - * speculative preallocation using cowextsize, so that we also unshare - * block adjacent to shared blocks instead of just the shared blocks - * themselves. Second the lookup in the extent list is generally faster - * than going out to the shared extent tree. - */ - if (xfs_is_cow_inode(ip)) { - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, - &ccur, &cmap); - if (!cow_eof && cmap.br_startoff <= offset_fsb) { - trace_xfs_reflink_cow_found(ip, &cmap); - whichfork = XFS_COW_FORK; - goto done; - } - } - - if (imap.br_startoff <= offset_fsb) { - /* - * For reflink files we may need a delalloc reservation when - * overwriting shared extents. This includes zeroing of - * existing extents that contain data. - */ - if (!xfs_is_cow_inode(ip) || - ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { - trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, - &imap); - goto done; - } - - xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_inode_need_cow(ip, &imap, &shared); - if (error) - goto out_unlock; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) { - trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, - &imap); - goto done; - } - - /* - * Fork all the shared blocks from our write offset until the - * end of the extent. 
- */ - whichfork = XFS_COW_FORK; - end_fsb = imap.br_startoff + imap.br_blockcount; - } else { - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES - * pages to keep the chunks of work done where somewhat - * symmetric with the work writeback does. This is a completely - * arbitrary number pulled out of thin air. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - - if (xfs_is_always_cow_inode(ip)) - whichfork = XFS_COW_FORK; - } - - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, - count, &icur); - if (prealloc_blocks) { - xfs_extlen_t align; - xfs_off_t end_offset; - xfs_fileoff_t p_end_fsb; - - end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; - - align = xfs_eof_alignment(ip, 0); - if (align) - p_end_fsb = roundup_64(p_end_fsb, align); - - p_end_fsb = min(p_end_fsb, maxbytes_fsb); - ASSERT(p_end_fsb > offset_fsb); - prealloc_blocks = p_end_fsb - end_fsb; - } - } - -retry: - error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, - whichfork == XFS_DATA_FORK ? &imap : &cmap, - whichfork == XFS_DATA_FORK ? &icur : &ccur, - whichfork == XFS_DATA_FORK ? eof : cow_eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ - trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc_blocks) { - prealloc_blocks = 0; - goto retry; - } - /*FALLTHRU*/ - default: - goto out_unlock; - } - - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. 
- */ - iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, whichfork, - whichfork == XFS_DATA_FORK ? &imap : &cmap); -done: - if (whichfork == XFS_COW_FORK) { - if (imap.br_startoff > offset_fsb) { - xfs_trim_extent(&cmap, offset_fsb, - imap.br_startoff - offset_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); - goto out_unlock; - } - /* ensure we only report blocks we have a reservation for */ - xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); - shared = true; - } - error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - int xfs_iomap_write_unwritten( xfs_inode_t *ip, @@ -765,6 +539,11 @@ xfs_iomap_write_unwritten( */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + /* Attach dquots so that bmbt splits are accounted correctly. */ + error = xfs_qm_dqattach(ip); + if (error) + return error; + do { /* * Set up a transaction to convert the range of extents @@ -783,6 +562,11 @@ xfs_iomap_write_unwritten( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto error_on_bmapi_transaction; + /* * Modify the unwritten extent state of the buffer. 
*/ @@ -840,23 +624,42 @@ error_on_bmapi_transaction: static inline bool imap_needs_alloc( struct inode *inode, + unsigned flags, struct xfs_bmbt_irec *imap, int nimaps) { - return !nimaps || - imap->br_startblock == HOLESTARTBLOCK || - imap->br_startblock == DELAYSTARTBLOCK || - (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); + /* don't allocate blocks when just zeroing */ + if (flags & IOMAP_ZERO) + return false; + if (!nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK) + return true; + /* we convert unwritten extents before copying the data for DAX */ + if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN) + return true; + return false; } static inline bool -needs_cow_for_zeroing( +imap_needs_cow( + struct xfs_inode *ip, + unsigned int flags, struct xfs_bmbt_irec *imap, int nimaps) { - return nimaps && - imap->br_startblock != HOLESTARTBLOCK && - imap->br_state != XFS_EXT_UNWRITTEN; + if (!xfs_is_cow_inode(ip)) + return false; + + /* when zeroing we don't have to COW holes or unwritten extents */ + if (flags & IOMAP_ZERO) { + if (!nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_state == XFS_EXT_UNWRITTEN) + return false; + } + + return true; } static int @@ -872,15 +675,8 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_cow_inode(ip) && is_write) { - /* - * FIXME: It could still overwrite on unshared extents and not - * need allocation. - */ - if (flags & IOMAP_NOWAIT) - return -EAGAIN; + if (xfs_is_cow_inode(ip) && is_write) mode = XFS_ILOCK_EXCL; - } /* * Extents not yet cached requires exclusive access, don't block. 
This @@ -917,111 +713,73 @@ relock: } static int -xfs_file_iomap_begin( +xfs_direct_write_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - xfs_fileoff_t offset_fsb, end_fsb; + struct xfs_bmbt_irec imap, cmap; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; + u16 iomap_flags = 0; unsigned lockmode; + ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) && - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { - /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); - } - /* - * Lock the inode in the manner required for the specified operation and - * check for as many conditions that would result in blocking as - * possible. This removes most of the non-blocking checks from the - * mapping code below. + * Writes that span EOF might trigger an IO size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC even if + * there is no other metadata changes pending or have been made here. 
*/ + if (offset + length > i_size_read(inode)) + iomap_flags |= IOMAP_F_DIRTY; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; - ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset > mp->m_super->s_maxbytes - length) - length = mp->m_super->s_maxbytes - offset; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - end_fsb = XFS_B_TO_FSB(mp, offset + length); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared); - if (error) + if (imap_needs_cow(ip, flags, &imap, nimaps)) { + error = -EAGAIN; + if (flags & IOMAP_NOWAIT) goto out_unlock; - } - - /* Non-modifying mapping requested, so we are done */ - if (!(flags & (IOMAP_WRITE | IOMAP_ZERO))) - goto out_found; - - /* - * Break shared extents if necessary. Checks for non-blocking IO have - * been done up front, so we don't need to do them here. - */ - if (xfs_is_cow_inode(ip)) { - struct xfs_bmbt_irec cmap; - bool directio = (flags & IOMAP_DIRECT); - - /* if zeroing doesn't need COW allocation, then we are done. */ - if ((flags & IOMAP_ZERO) && - !needs_cow_for_zeroing(&imap, nimaps)) - goto out_found; /* may drop and re-acquire the ilock */ - cmap = imap; - error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, - directio); + error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, + &lockmode, flags & IOMAP_DIRECT); if (error) goto out_unlock; - - /* - * For buffered writes we need to report the address of the - * previous block (if there was any) so that the higher level - * write code can perform read-modify-write operations; we - * won't need the CoW fork mapping until writeback. For direct - * I/O, which must be block aligned, we need to report the - * newly allocated address. 
If the data fork has a hole, copy - * the COW fork mapping to avoid allocating to the data fork. - */ - if (directio || imap.br_startblock == HOLESTARTBLOCK) - imap = cmap; - + if (shared) + goto out_found_cow; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - /* Don't need to allocate over holes when doing zeroing operations. */ - if (flags & IOMAP_ZERO) - goto out_found; + if (imap_needs_alloc(inode, flags, &imap, nimaps)) + goto allocate_blocks; - if (!imap_needs_alloc(inode, &imap, nimaps)) - goto out_found; + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); - /* If nowait is set bail since we are going to make allocations. */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; +allocate_blocks: + error = -EAGAIN; + if (flags & IOMAP_NOWAIT) goto out_unlock; - } /* * We cap the maximum length we map to a sane size to keep the chunks @@ -1033,48 +791,273 @@ xfs_file_iomap_begin( * lower level functions are updated. */ length = min_t(loff_t, length, 1024 * PAGE_SIZE); + end_fsb = xfs_iomap_end_fsb(mp, offset, length); - /* - * xfs_iomap_write_direct() expects the shared lock. It is unlocked on - * return. 
- */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); + if (offset + length > XFS_ISIZE(ip)) + end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); + else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + xfs_iunlock(ip, lockmode); + + error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, + &imap); if (error) return error; - iomap->flags |= IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); -out_finish: - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); - -out_found: - ASSERT(nimaps); +out_found_cow: xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - goto out_finish; + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + if (imap.br_startblock != HOLESTARTBLOCK) { + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + if (error) + return error; + } + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); out_unlock: xfs_iunlock(ip, lockmode); return error; } +const struct iomap_ops xfs_direct_write_iomap_ops = { + .iomap_begin = xfs_direct_write_iomap_begin, +}; + static int -xfs_file_iomap_end_delalloc( - struct xfs_inode *ip, +xfs_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; + xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int allocfork = 
XFS_DATA_FORK; + int error = 0; + + /* we can't use delayed allocations when using extent size hints */ + if (xfs_get_extsz_hint(ip)) + return xfs_direct_write_iomap_begin(inode, offset, count, + flags, iomap, srcmap); + + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); + if (eof) + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } + + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. 
+ */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + goto found_cow; + } + } + + if (imap.br_startoff <= offset_fsb) { + /* + * For reflink files we may need a delalloc reservation when + * overwriting shared extents. This includes zeroing of + * existing extents that contain data. + */ + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto found_imap; + } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_bmap_trim_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto found_imap; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + allocfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. 
+ */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = xfs_iomap_end_fsb(mp, offset, count); + + if (xfs_is_always_cow_inode(ip)) + allocfork = XFS_COW_FORK; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_unlock; + + if (eof) { + prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, + count, &icur); + if (prealloc_blocks) { + xfs_extlen_t align; + xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; + + end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1); + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; + + align = xfs_eof_alignment(ip); + if (align) + p_end_fsb = roundup_64(p_end_fsb, align); + + p_end_fsb = min(p_end_fsb, + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; + } + } + +retry: + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + allocfork == XFS_DATA_FORK ? &imap : &cmap, + allocfork == XFS_DATA_FORK ? &icur : &ccur, + allocfork == XFS_DATA_FORK ? eof : cow_eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_delalloc_enospc(ip, offset, count); + if (prealloc_blocks) { + prealloc_blocks = 0; + goto retry; + } + /*FALLTHRU*/ + default: + goto out_unlock; + } + + if (allocfork == XFS_COW_FORK) { + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); + goto found_cow; + } + + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. 
+ */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + +found_imap: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + +found_cow: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (imap.br_startoff <= offset_fsb) { + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + if (error) + return error; + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + } + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +static int +xfs_buffered_write_iomap_end( + struct inode *inode, loff_t offset, loff_t length, ssize_t written, + unsigned flags, struct iomap *iomap) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; + if (iomap->type != IOMAP_DELALLOC) + return 0; + /* * Behave as if the write failed if drop writes is enabled. Set the NEW * flag to force delalloc cleanup. 
@@ -1119,24 +1102,51 @@ xfs_file_iomap_end_delalloc( return 0; } +const struct iomap_ops xfs_buffered_write_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, + .iomap_end = xfs_buffered_write_iomap_end, +}; + static int -xfs_file_iomap_end( +xfs_read_iomap_begin( struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { - if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) - return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written, iomap); - return 0; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + int nimaps = 1, error = 0; + bool shared = false; + unsigned lockmode; + + ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, 0); + if (!error && (flags & IOMAP_REPORT)) + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + xfs_iunlock(ip, lockmode); + + if (error) + return error; + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? 
IOMAP_F_SHARED : 0); } -const struct iomap_ops xfs_iomap_ops = { - .iomap_begin = xfs_file_iomap_begin, - .iomap_end = xfs_file_iomap_end, +const struct iomap_ops xfs_read_iomap_ops = { + .iomap_begin = xfs_read_iomap_begin, }; static int @@ -1145,7 +1155,8 @@ xfs_seek_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1178,8 +1189,7 @@ xfs_seek_iomap_begin( /* * Fake a hole until the end of the file. */ - data_fsb = min(XFS_B_TO_FSB(mp, offset + length), - XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + data_fsb = xfs_iomap_end_fsb(mp, offset, length); } /* @@ -1193,7 +1203,7 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1215,7 +1225,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1231,7 +1241,8 @@ xfs_xattr_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1261,7 +1272,7 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, false); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 5c2f6aa6d78f..7d3703556d0e 100644 --- a/fs/xfs/xfs_iomap.h +++ 
b/fs/xfs/xfs_iomap.h @@ -11,13 +11,14 @@ struct xfs_inode; struct xfs_bmbt_irec; -int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int); +int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); +xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, + xfs_fileoff_t end_fsb); int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, bool shared); -xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); + struct xfs_bmbt_irec *, u16); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -39,7 +40,9 @@ xfs_aligned_fsb_count( return count_fsb; } -extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fe285d123d69..f7a99b3bbcf7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -20,8 +20,8 @@ #include "xfs_symlink.h" #include "xfs_dir2.h" #include "xfs_iomap.h" +#include "xfs_error.h" -#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/iversion.h> @@ -49,8 +49,15 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_SECURE, + .name = xattr->name, + .namelen = strlen(xattr->name), + .value = xattr->value, + .valuelen = xattr->value_len, + }; + error = xfs_attr_set(&args); if (error < 0) break; } @@ -470,20 +477,57 @@ xfs_vn_get_link_inline( struct inode *inode, 
struct delayed_call *done) { + struct xfs_inode *ip = XFS_I(inode); char *link; - ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + ASSERT(ip->i_df.if_flags & XFS_IFINLINE); /* * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if * if_data is junk. */ - link = XFS_I(inode)->i_df.if_u1.if_data; - if (!link) + link = ip->i_df.if_u1.if_data; + if (XFS_IS_CORRUPT(ip->i_mount, !link)) return ERR_PTR(-EFSCORRUPTED); return link; } +static uint32_t +xfs_stat_blksize( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If the file blocks are being allocated from a realtime volume, then + * always return the realtime extent size. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + + /* + * Allow large block sizes to be reported to userspace programs if the + * "largeio" mount option is used. + * + * If compatibility mode is specified, simply return the basic unit of + * caching so that we don't get inefficient read/modify/write I/O from + * user apps. Otherwise.... + * + * If the underlying volume is a stripe, then return the stripe width in + * bytes as the recommended I/O size. It is not a stripe and we've set a + * default buffered I/O size, return that, otherwise return the compat + * default. 
+ */ + if (mp->m_flags & XFS_MOUNT_LARGEIO) { + if (mp->m_swidth) + return mp->m_swidth << mp->m_sb.sb_blocklog; + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + return 1U << mp->m_allocsize_log; + } + + return PAGE_SIZE; +} + STATIC int xfs_vn_getattr( const struct path *path, @@ -513,11 +557,10 @@ xfs_vn_getattr( stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = ip->i_d.di_crtime.t_sec; - stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec; + stat->btime = ip->i_d.di_crtime; } } @@ -543,16 +586,7 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; default: - if (XFS_IS_REALTIME_INODE(ip)) { - /* - * If the file blocks are being allocated from a - * realtime volume, then return the inode's realtime - * extent size or the realtime volume's extent size. - */ - stat->blksize = - xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; - } else - stat->blksize = xfs_preferred_iosize(mp); + stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; break; } @@ -662,9 +696,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), - xfs_kgid_to_gid(gid), - xfs_get_projid(ip), + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -733,7 +765,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { @@ -745,7 +776,6 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -883,10 +913,10 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, 
&xfs_iomap_ops); + &did_zeroing, &xfs_buffered_write_iomap_ops); } else { error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_iomap_ops); + &xfs_buffered_write_iomap_ops); } if (error) @@ -1114,7 +1144,7 @@ xfs_vn_fiemap( &xfs_xattr_iomap_ops); } else { error = iomap_fiemap(inode, fieinfo, start, length, - &xfs_iomap_ops); + &xfs_read_iomap_ops); } xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); @@ -1227,7 +1257,7 @@ xfs_inode_supports_dax( return false; /* Device has to support DAX too. */ - return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL; + return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } STATIC void @@ -1274,9 +1304,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); - inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); @@ -1290,9 +1317,7 @@ xfs_setup_inode( lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_dir_key); lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - ip->d_ops = ip->i_mount->m_dir_inode_ops; } else { - ip->d_ops = ip->i_mount->m_nondir_inode_ops; lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 884950adbd16..ff2da28fed90 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -84,10 +84,10 @@ xfs_bulkstat_one_int( /* xfs_iget returns the following without needing * further change. 
*/ - buf->bs_projectid = xfs_get_projid(ip); + buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = dic->di_uid; - buf->bs_gid = dic->di_gid; + buf->bs_uid = i_uid_read(inode); + buf->bs_gid = i_gid_read(inode); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -97,8 +97,8 @@ xfs_bulkstat_one_int( buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; buf->bs_ctime = inode->i_ctime.tv_sec; buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; - buf->bs_btime = dic->di_crtime.t_sec; - buf->bs_btime_nsec = dic->di_crtime.t_nsec; + buf->bs_btime = dic->di_crtime.tv_sec; + buf->bs_btime_nsec = dic->di_crtime.tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; @@ -110,7 +110,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (dic->di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = dic->di_cowextsize; } diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index aa375cf53021..233dcc8784db 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -298,7 +298,8 @@ xfs_iwalk_ag_start( error = xfs_inobt_get_rec(*curpp, irec, has_more); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1); + if (XFS_IS_CORRUPT(mp, *has_more != 1)) + return -EFSCORRUPTED; /* * If the LE lookup yielded an inobt record before the cursor position, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ca15105681ca..9f70d2f68e05 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t; #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> +#include <linux/xattr.h> #include <asm/page.h> #include <asm/div64.h> @@ -163,32 +164,6 @@ struct xstats { extern struct xstats xfsstats; -/* Kernel uid/gid conversion. 
These are used to convert to/from the on disk - * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. - * The conversion here is type only, the value will remain the same since we - * are converting to the init_user_ns. The uid is later mapped to a particular - * user namespace value when crossing the kernel/user boundary. - */ -static inline uint32_t xfs_kuid_to_uid(kuid_t uid) -{ - return from_kuid(&init_user_ns, uid); -} - -static inline kuid_t xfs_uid_to_kuid(uint32_t uid) -{ - return make_kuid(&init_user_ns, uid); -} - -static inline uint32_t xfs_kgid_to_gid(kgid_t gid) -{ - return from_kgid(&init_user_ns, gid); -} - -static inline kgid_t xfs_gid_to_kgid(uint32_t gid) -{ - return make_kgid(&init_user_ns, gid); -} - static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); @@ -223,26 +198,32 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); #define ASSERT_ALWAYS(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG */ #ifdef XFS_WARN #define ASSERT(expr) \ - (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : asswarn(NULL, #expr, __FILE__, __LINE__)) #else /* !DEBUG && !XFS_WARN */ -#define ASSERT(expr) ((void)0) +#define ASSERT(expr) ((void)0) #endif /* XFS_WARN */ #endif /* DEBUG */ +#define XFS_IS_CORRUPT(mp, expr) \ + (unlikely(expr) ? 
xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \ + NULL, 0, __FILE__, __LINE__, \ + __this_address), \ + true : false) + #define STATIC static noinline #ifdef CONFIG_XFS_RT diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 641d07f30a27..00fda2e8e738 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -47,8 +40,7 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted); + struct xlog_in_core *iclog); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -57,33 +49,19 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *continued_write, int *logoffsetp); -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog); STATIC void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, int eventual_size); STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( +xlog_sync( struct xlog *log, - struct xlog_ticket *ticket); - + struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void xlog_verify_dest_ptr( @@ -485,84 +463,67 @@ out_error: return error; } +static bool +__xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + lockdep_assert_held(&log->l_icloglock); -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. 
- */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + return true; } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + return false; +} - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); +/* + * Flush iclog to disk if this is the last reference to the given iclog and the + * it is in the WANT_SYNC state. 
+ */ +static int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + lockdep_assert_held(&log->l_icloglock); - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); + if (iclog->ic_state == XLOG_STATE_IOERROR) + return -EIO; - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; + if (atomic_dec_and_test(&iclog->ic_refcnt) && + __xlog_state_release_iclog(log, iclog)) { + spin_unlock(&log->l_icloglock); + xlog_sync(log, iclog); + spin_lock(&log->l_icloglock); } - xfs_log_ticket_put(ticket); - return lsn; + return 0; } -int +void xfs_log_release_iclog( - struct xfs_mount *mp, struct xlog_in_core *iclog) { - if (xlog_state_release_iclog(mp->m_log, iclog)) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return -EIO; + struct xlog *log = iclog->ic_log; + bool sync = false; + + if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { + if (iclog->ic_state != XLOG_STATE_IOERROR) + sync = __xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); } - return 0; + if (sync) + xlog_sync(log, iclog); } /* @@ -801,32 +762,69 @@ xfs_log_mount_cancel( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Wait for the iclog to be written disk, or return an error if the log has been + * shut down. 
*/ +static int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) +{ + struct xlog *log = iclog->ic_log; -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) + if (!XLOG_FORCED_SHUTDOWN(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = &reg, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log.
+ */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -837,23 +835,7 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... 
@@ -865,31 +847,32 @@ out_err: spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + /* * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). @@ -897,16 +880,11 @@ out_err: * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; /* * Don't write out unmount record on norecovery mounts or ro devices. 
@@ -915,61 +893,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_NORECOVERY || xfs_readonly_buftarg(log->l_targ)) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; + return; } - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); - -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! (XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. - */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); + xfs_log_force(mp, XFS_LOG_SYNC); - if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE - || iclog->ic_state == XLOG_STATE_DIRTY - || iclog->ic_state == XLOG_STATE_IOERROR) ) { + if (XLOG_FORCED_SHUTDOWN(log)) + return; - xlog_wait(&iclog->ic_force_wait, - &log->l_icloglock); - } else { - spin_unlock(&log->l_icloglock); - } + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. 
+ */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1232,7 +1179,6 @@ xlog_ioend_work( struct xlog_in_core *iclog = container_of(work, struct xlog_in_core, ic_end_io_work); struct xlog *log = iclog->ic_log; - bool aborted = false; int error; error = blk_status_to_errno(iclog->ic_bio.bi_status); @@ -1248,17 +1194,9 @@ xlog_ioend_work( if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = true; - } else if (iclog->ic_state & XLOG_STATE_IOERROR) { - aborted = true; } - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); bio_uninit(&iclog->ic_bio); /* @@ -1479,7 +1417,7 @@ xlog_alloc_log( log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_fsname); + mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1495,6 +1433,8 @@ out_free_iclog: prev_iclog = iclog->ic_next; kmem_free(iclog->ic_data); kmem_free(iclog); + if (prev_iclog == log->l_iclog) + break; } out_free_log: kmem_free(log); @@ -1502,20 +1442,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. 
*/ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1525,12 +1462,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = &reg, }; + int error; + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -1690,7 +1630,7 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static void +static int xlog_map_iclog_data( struct bio *bio, void *data, @@ -1701,11 +1641,14 @@ xlog_map_iclog_data( unsigned int off = offset_in_page(data); size_t len = min_t(size_t, count, PAGE_SIZE - off); - WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len); + if (bio_add_page(bio, page, len, off) != len) + return -EIO; data += len; count -= len; } while (count); + + return 0; } STATIC void @@ -1727,7 +1670,7 @@ xlog_write_iclog( * across the log IO to archieve that. */ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) { + if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1735,25 +1678,34 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path.
*/ - xlog_state_done_syncing(iclog, XFS_LI_ABORTED); + xlog_state_done_syncing(iclog); up(&iclog->ic_sema); return; } - iclog->ic_io_size = count; - bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size); + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return; + } if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size); + flush_kernel_vmap_range(iclog->ic_data, count); /* * If this log buffer would straddle the end of the log we will have @@ -1964,12 +1916,11 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. 
*/ -/* ARGSUSED */ static inline void xlog_state_finish_copy( struct xlog *log, @@ -1977,16 +1928,11 @@ xlog_state_finish_copy( int record_cnt, int copy_bytes) { - spin_lock(&log->l_icloglock); + lockdep_assert_held(&log->l_icloglock); be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; - - spin_unlock(&log->l_icloglock); -} /* xlog_state_finish_copy */ - - - +} /* * print out info relating to regions written which consume @@ -2107,23 +2053,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2145,27 +2089,16 @@ xlog_write_calc_vec_length( return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. 
- */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2263,15 +2196,18 @@ xlog_write_copy_finish( int log_offset, struct xlog_in_core **commit_iclog) { + int error; + if (*partial_copy) { /* * This iclog has already been marked WANT_SYNC by * xlog_state_get_iclog_space. */ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - return xlog_state_release_iclog(log, iclog); + goto release_iclog; } *partial_copy = 0; @@ -2279,21 +2215,29 @@ xlog_write_copy_finish( if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { /* no more space in this iclog - push it. 
*/ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + goto release_iclog; + spin_unlock(&log->l_icloglock); ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } return 0; + +release_iclog: + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + return error; } /* @@ -2343,39 +2287,28 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; int record_cnt = 0; int data_cnt = 0; - int error; - - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); + int error = 0; /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. 
*/ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2383,9 +2316,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2409,7 +2341,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2425,11 +2356,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. + */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2461,8 +2396,13 @@ xlog_write( xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? 
copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, @@ -2506,128 +2446,119 @@ next_lv: ASSERT(len == 0); + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + if (commit_iclog) { + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } else { + error = xlog_state_release_iclog(log, iclog); + } + spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - return 0; + return error; } +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) +{ + ASSERT(list_empty_careful(&iclog->ic_callbacks)); -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ + /* + * If the number of ops in this iclog indicate it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. + */ + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; + } + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; +} /* - * An iclog has just finished IO completion processing, so we need to update - * the iclog state and propagate that up into the overall log state. 
Hence we - * prepare the iclog for cleaning, and then clean all the pending dirty iclogs - * starting from the head, and then wake up any threads that are waiting for the - * iclog to be marked clean. - * - * The ordering of marking iclogs ACTIVE must be maintained, so an iclog - * doesn't become ACTIVE beyond one that is SYNCING. This is also required to - * maintain the notion that we use a ordered wait queue to hold off would be - * writers to the log when every iclog is trying to sync to disk. - * - * Caller must hold the icloglock before calling us. - * - * State Change: !IOERROR -> DIRTY -> ACTIVE + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. */ -STATIC void -xlog_state_clean_iclog( +static void +xlog_state_activate_iclogs( struct xlog *log, - struct xlog_in_core *dirty_iclog) + int *iclogs_changed) { - struct xlog_in_core *iclog; - int changed = 0; - - /* Prepare the completed iclog. */ - if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) - dirty_iclog->ic_state = XLOG_STATE_DIRTY; + struct xlog_in_core *iclog = log->l_iclog; - /* Walk all the iclogs to update the ordered active state. */ - iclog = log->l_iclog; do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(list_empty_careful(&iclog->ic_callbacks)); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. - * We don't need to cover the dummy. - */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. 
- */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ /* - * Wake up threads waiting in xfs_log_force() for the dirty iclog - * to be cleaned. + * We usually go to NEED. But we go to NEED2 if the changed indicates we + * are done writing the dummy record. If we are done with the second + * dummy recored (DONE2), then we go to IDLE. */ - wake_up_all(&dirty_iclog->ic_force_wait); + switch (prev_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + break; + case XLOG_STATE_COVER_DONE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_NEED2; + break; + case XLOG_STATE_COVER_DONE2: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; + break; + default: + ASSERT(0); + } - /* - * Change state for the dummy log recording. - * We usually go to NEED. But we go to NEED2 if the changed indicates - * we are done writing the dummy record. - * If we are done with the second dummy recored (DONE2), then - * we go to IDLE. 
- */ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + return XLOG_STATE_COVER_NEED; +} - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + dirty_iclog->ic_state = XLOG_STATE_DIRTY; - default: - ASSERT(0); - } + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); } } @@ -2639,7 +2570,8 @@ xlog_get_lowest_lsn( xfs_lsn_t lowest_lsn = 0, lsn; do { - if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) continue; lsn = be64_to_cpu(iclog->ic_header.h_lsn); @@ -2699,61 +2631,48 @@ static bool xlog_state_iodone_process_iclog( struct xlog *log, struct xlog_in_core *iclog, - struct xlog_in_core *completed_iclog, bool *ioerror) { xfs_lsn_t lowest_lsn; xfs_lsn_t header_lsn; - /* Skip all iclogs in the ACTIVE & DIRTY states */ - if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: + case XLOG_STATE_DIRTY: + /* + * Skip all iclogs in the ACTIVE & DIRTY states: + */ return false; - - /* - * Between marking a filesystem SHUTDOWN and stopping the log, we do - * flush all iclogs to disk (if there wasn't a log I/O error). So, we do - * want things to go smoothly in case of just a SHUTDOWN w/o a - * LOG_IO_ERROR. 
- */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { + case XLOG_STATE_IOERROR: + /* + * Between marking a filesystem SHUTDOWN and stopping the log, + * we do flush all iclogs to disk (if there wasn't a log I/O + * error). So, we do want things to go smoothly in case of just + * a SHUTDOWN w/o a LOG_IO_ERROR. + */ *ioerror = true; return false; - } - - /* - * Can only perform callbacks in order. Since this iclog is not in the - * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean - * up. If we set our iclog to DO_CALLBACK, we will not process it when - * we retry since a previous iclog is in the CALLBACK and the state - * cannot change since we are holding the l_icloglock. - */ - if (!(iclog->ic_state & - (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { - if (completed_iclog && - (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { - completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; - } + case XLOG_STATE_DONE_SYNC: + /* + * Now that we have an iclog that is in the DONE_SYNC state, do + * one more check here to see if we have chased our tail around. + * If this is not the lowest lsn iclog, then we will leave it + * for another completion to process. + */ + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) + return false; + xlog_state_set_callback(log, iclog, header_lsn); + return false; + default: + /* + * Can only perform callbacks in order. Since this iclog is not + * in the DONE_SYNC state, we skip the rest and just try to + * clean up. + */ return true; } - - /* - * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC - * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught - * by the above if and are going to clean (i.e. we aren't doing their - * callbacks) see the above if. - * - * We will do one more check here to see if we have chased our tail - * around. 
If this is not the lowest lsn iclog, then we will leave it - * for another completion to process. - */ - header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) - return false; - - xlog_state_set_callback(log, iclog, header_lsn); - return false; - } /* @@ -2768,8 +2687,9 @@ xlog_state_iodone_process_iclog( static void xlog_state_do_iclog_callbacks( struct xlog *log, - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) { spin_unlock(&log->l_icloglock); spin_lock(&iclog->ic_callback_lock); @@ -2779,7 +2699,7 @@ xlog_state_do_iclog_callbacks( list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); + xlog_cil_process_committed(&tmp); spin_lock(&iclog->ic_callback_lock); } @@ -2792,57 +2712,12 @@ xlog_state_do_iclog_callbacks( spin_unlock(&iclog->ic_callback_lock); } -#ifdef DEBUG -/* - * Make one last gasp attempt to see if iclogs are being left in limbo. If the - * above loop finds an iclog earlier than the current iclog and in one of the - * syncing states, the current iclog is put into DO_CALLBACK and the callbacks - * are deferred to the completion of the earlier iclog. Walk the iclogs in order - * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in - * one of the syncing states. - * - * Note that SYNCING|IOERROR is a valid state so we cannot just check for - * ic_state == SYNCING. - */ -static void -xlog_state_callback_check_state( - struct xlog *log) -{ - struct xlog_in_core *first_iclog = log->l_iclog; - struct xlog_in_core *iclog = first_iclog; - - do { - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); - /* - * Terminate the loop if iclogs are found in states - * which will cause other threads to clean up iclogs. 
- * - * SYNCING - i/o completion will go through logs - * DONE_SYNC - interrupt thread should be waiting for - * l_icloglock - * IOERROR - give up hope all ye who enter here - */ - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state & XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_DONE_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR ) - break; - iclog = iclog->ic_next; - } while (first_iclog != iclog); -} -#else -#define xlog_state_callback_check_state(l) ((void)0) -#endif - STATIC void xlog_state_do_callback( - struct xlog *log, - bool aborted, - struct xlog_in_core *ciclog) + struct xlog *log) { struct xlog_in_core *iclog; struct xlog_in_core *first_iclog; - bool did_callbacks = false; bool cycled_icloglock; bool ioerror; int flushcnt = 0; @@ -2866,11 +2741,11 @@ xlog_state_do_callback( do { if (xlog_state_iodone_process_iclog(log, iclog, - ciclog, &ioerror)) + &ioerror)) break; - if (!(iclog->ic_state & - (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { + if (iclog->ic_state != XLOG_STATE_CALLBACK && + iclog->ic_state != XLOG_STATE_IOERROR) { iclog = iclog->ic_next; continue; } @@ -2880,14 +2755,14 @@ xlog_state_do_callback( * we'll have to run at least one more complete loop. 
*/ cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog, aborted); - - xlog_state_clean_iclog(log, iclog); + xlog_state_do_iclog_callbacks(log, iclog); + if (XLOG_FORCED_SHUTDOWN(log)) + wake_up_all(&iclog->ic_force_wait); + else + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); - did_callbacks |= cycled_icloglock; - if (repeats > 5000) { flushcnt += repeats; repeats = 0; @@ -2897,10 +2772,8 @@ xlog_state_do_callback( } } while (!ioerror && cycled_icloglock); - if (did_callbacks) - xlog_state_callback_check_state(log); - - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || + log->l_iclog->ic_state == XLOG_STATE_IOERROR) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2922,25 +2795,22 @@ xlog_state_do_callback( */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) { struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - - ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_IOERROR); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. 
*/ - if (iclog->ic_state != XLOG_STATE_IOERROR) + if (!XLOG_FORCED_SHUTDOWN(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; + } /* * Someone could be sleeping prior to writing out the next @@ -2949,9 +2819,8 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -2983,7 +2852,6 @@ xlog_state_get_iclog_space( int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; - int error; restart: spin_lock(&log->l_icloglock); @@ -3032,24 +2900,22 @@ restart: * can fit into remaining data section. */ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + int error = 0; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* - * If I'm the only one writing to this iclog, sync it to disk. - * We need to do an atomic compare and decrement here to avoid - * racing with concurrent atomic_dec_and_lock() calls in + * If we are the only one writing to this iclog, sync it to + * disk. We need to do an atomic compare and decrement here to + * avoid racing with concurrent atomic_dec_and_lock() calls in * xlog_state_release_iclog() when there is more than one * reference to the iclog. 
*/ - if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { - /* we are the only one */ - spin_unlock(&log->l_icloglock); + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) error = xlog_state_release_iclog(log, iclog); - if (error) - return error; - } else { - spin_unlock(&log->l_icloglock); - } + spin_unlock(&log->l_icloglock); + if (error) + return error; goto restart; } @@ -3073,21 +2939,21 @@ restart: *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ - -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +} + +/* + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one unit's worth. Also move grant reservation head + * forward. 
*/ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3099,21 +2965,20 @@ xlog_regrant_reserve_log_space( ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); - - trace_xfs_log_regrant_reserve_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3129,18 +2994,19 @@ xlog_regrant_reserve_log_space( * space, the count will stay at zero and the only space remaining will be * in the current reservation field. 
*/ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3155,71 +3021,15 @@ xlog_ungrant_log_space( xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* - * Flush iclog to disk if this is the last reference to the given iclog and - * the WANT_SYNC bit is set. - * - * When this function is entered, the iclog is not necessarily in the - * WANT_SYNC state. It may be sitting around waiting to get filled. - * - * - */ -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog) -{ - int sync = 0; /* do we sync? 
*/ - - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - - ASSERT(atomic_read(&iclog->ic_refcnt) > 0); - if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) - return 0; - - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_WANT_SYNC); - - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { - /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); - sync++; - iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); - /* cycle incremented when incrementing curr_block */ - } - spin_unlock(&log->l_icloglock); - - /* - * We let the log lock go, so it's possible that we hit a log I/O - * error or some other SHUTDOWN condition that marks the iclog - * as XLOG_STATE_IOERROR before the bwrite. However, we know that - * this iclog has consistent data, so we ignore IOERROR - * flags after this point. - */ - if (sync) - xlog_sync(log, iclog); - return 0; -} /* xlog_state_release_iclog */ - - -/* - * This routine will mark the current iclog in the ring as WANT_SYNC - * and move the current iclog pointer to the next iclog in the ring. - * When this routine is called from xlog_state_get_iclog_space(), the - * exact size of the iclog has not yet been determined. All we know is - * that every data block. We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. 
*/ STATIC void xlog_state_switch_iclogs( @@ -3228,6 +3038,8 @@ xlog_state_switch_iclogs( int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3262,7 +3074,7 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3307,7 +3119,7 @@ xfs_log_force( spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3322,9 +3134,6 @@ xfs_log_force( * previous iclog and go to sleep. */ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* @@ -3337,14 +3146,10 @@ xfs_log_force( atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; - spin_lock(&log->l_icloglock); - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) goto out_unlock; } else { /* @@ -3364,17 +3169,8 @@ xfs_log_force( ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state & XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3396,7 +3192,7 @@ 
__xfs_log_force_lsn( spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { @@ -3405,9 +3201,6 @@ __xfs_log_force_lsn( goto out_unlock; } - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* * We sleep here if we haven't already slept (e.g. this is the @@ -3425,10 +3218,8 @@ __xfs_log_force_lsn( * will go out then. */ if (!already_slept && - (iclog->ic_prev->ic_state & - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - + (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_prev->ic_write_wait, @@ -3437,27 +3228,14 @@ __xfs_log_force_lsn( } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; if (log_flushed) *log_flushed = 1; - spin_lock(&log->l_icloglock); } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) - goto out_unlock; - - if (iclog->ic_state & XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3504,33 +3282,6 @@ xfs_log_force_lsn( } /* - * Called when we want to mark the current iclog as being ready to sync to - * disk. 
- */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state & - (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - -/* * Free a used ticket when its refcount falls to zero. */ void @@ -3539,7 +3290,7 @@ xfs_log_ticket_put( { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) - kmem_zone_free(xfs_log_ticket_zone, ticket); + kmem_cache_free(xfs_log_ticket_zone, ticket); } xlog_ticket_t * @@ -3678,7 +3429,6 @@ xlog_ticket_alloc( tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; @@ -3687,13 +3437,6 @@ xlog_ticket_alloc( return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3779,7 +3522,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. @@ -3882,7 +3625,7 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* @@ -3895,7 +3638,7 @@ xlog_state_ioerror( xlog_in_core_t *iclog, *ic; iclog = log->l_iclog; - if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + if (iclog->ic_state != XLOG_STATE_IOERROR) { /* * Mark all the incore logs IOERROR. 
* From now on, no log flushes will result. @@ -3955,7 +3698,7 @@ xfs_log_force_umount( * Somebody could've already done the hard work for us. * No need to get locks for this. */ - if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } @@ -4006,21 +3749,8 @@ xfs_log_force_umount( spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, true, NULL); + xlog_state_do_callback(log); -#ifdef XFSERRORDEBUG - { - xlog_in_core_t *iclog; - - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - do { - ASSERT(iclog->ic_callback == 0); - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - spin_unlock(&log->l_icloglock); - } -#endif /* return non-zero if log IOERROR transition had already happened */ return retval; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..1412d6993f1e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -105,10 +105,6 @@ struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); @@ -121,8 +117,7 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_release_iclog(struct xfs_mount *mp, - struct xlog_in_core *iclog); +void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, @@ -138,7 +133,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 
xfs_lsn_t *commit_lsn, bool regrant); -void xlog_cil_process_committed(struct list_head *list, bool aborted); +void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ef652abd112c..b43f0e8f43f2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -179,7 +179,7 @@ xlog_cil_alloc_shadow_bufs( /* * We free and allocate here as a realloc would copy - * unecessary data. We don't use kmem_zalloc() for the + * unnecessary data. We don't use kmem_zalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. @@ -574,10 +574,10 @@ xlog_discard_busy_extents( */ static void xlog_cil_committed( - struct xfs_cil_ctx *ctx, - bool abort) + struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. @@ -613,37 +613,38 @@ xlog_cil_committed( void xlog_cil_process_committed( - struct list_head *list, - bool aborted) + struct list_head *list) { struct xfs_cil_ctx *ctx; while ((ctx = list_first_entry_or_null(list, struct xfs_cil_ctx, iclog_entry))) { list_del(&ctx->iclog_entry); - xlog_cil_committed(ctx, aborted); + xlog_cil_committed(ctx); } } /* - * Push the Committed Item List to the log. If @push_seq flag is zero, then it - * is a background flush and so we can chose to ignore it. Otherwise, if the - * current sequence is the same as @push_seq we need to do a flush. If - * @push_seq is less than the current sequence, then it has already been + * Push the Committed Item List to the log. + * + * If the current sequence is the same as xc_push_seq we need to do a flush. 
If + * xc_push_seq is less than the current sequence, then it has already been * flushed and we don't need to do anything - the caller will wait for it to * complete if necessary. * - * @push_seq is a value rather than a flag because that allows us to do an - * unlocked check of the sequence number for a match. Hence we can allows log - * forces to run racily and not issue pushes for the same sequence twice. If we - * get a race between multiple pushes for the same sequence they will block on - * the first one and then abort, hence avoiding needless pushes. + * xc_push_seq is checked unlocked against the sequence number for a match. + * Hence we can allow log forces to run racily and not issue pushes for the + * same sequence twice. If we get a race between multiple pushes for the same + * sequence they will block on the first one and then abort, hence avoiding + * needless pushes. */ -STATIC int -xlog_cil_push( - struct xlog *log) +static void +xlog_cil_push_work( + struct work_struct *work) { - struct xfs_cil *cil = log->l_cilp; + struct xfs_cil *cil = + container_of(work, struct xfs_cil, xc_push_work); + struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; @@ -657,9 +658,6 @@ xlog_cil_push( xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; - if (!cil) - return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -671,6 +669,11 @@ xlog_cil_push( ASSERT(push_seq <= ctx->sequence); /* + * Wake up any background push waiters now this context is being pushed. + */ + wake_up_all(&ctx->push_wait); + + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. 
@@ -682,7 +685,7 @@ xlog_cil_push( } - /* check for a previously pushed seqeunce */ + /* check for a previously pushed sequence */ if (push_seq < cil->xc_ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; @@ -746,6 +749,7 @@ xlog_cil_push( */ INIT_LIST_HEAD(&new_ctx->committing); INIT_LIST_HEAD(&new_ctx->busy_extents); + init_waitqueue_head(&new_ctx->push_wait); new_ctx->sequence = ctx->sequence + 1; new_ctx->cil = cil; cil->xc_ctx = new_ctx; @@ -803,7 +807,7 @@ xlog_cil_push( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; @@ -841,13 +845,14 @@ restart: } spin_unlock(&cil->xc_push_lock); - /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); - if (commit_lsn == -1) - goto out_abort; + error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + if (error) + goto out_abort_free_ticket; + + xfs_log_ticket_ungrant(log, tic); spin_lock(&commit_iclog->ic_callback_lock); - if (commit_iclog->ic_state & XLOG_STATE_IOERROR) { + if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { spin_unlock(&commit_iclog->ic_callback_lock); goto out_abort; } @@ -867,28 +872,20 @@ restart: spin_unlock(&cil->xc_push_lock); /* release the hounds! 
*/ - return xfs_log_release_iclog(log->l_mp, commit_iclog); + xfs_log_release_iclog(commit_iclog); + return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); - return 0; + return; out_abort_free_ticket: - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); out_abort: - xlog_cil_committed(ctx, true); - return -EIO; -} - -static void -xlog_cil_push_work( - struct work_struct *work) -{ - struct xfs_cil *cil = container_of(work, struct xfs_cil, - xc_push_work); - xlog_cil_push(cil->xc_log); + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + xlog_cil_committed(ctx); } /* @@ -900,7 +897,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -914,14 +911,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. 
+ */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1017,7 +1036,10 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); @@ -1038,9 +1060,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1199,6 +1221,7 @@ xlog_cil_init( INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + init_waitqueue_head(&ctx->push_wait); ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b880c23cb6e4..ec22c7a3867f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -40,26 +40,22 @@ static inline uint xlog_get_client_id(__be32 i) /* * In core log state */ -#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ -#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ -#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ -#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ -#define XLOG_STATE_DO_CALLBACK \ - 0x0010 /* Process callback functions */ -#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ -#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ -#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ -#define XLOG_STATE_ALL 0x7FFF /* All possible valid 
flags */ -#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ +enum xlog_iclog_state { + XLOG_STATE_ACTIVE, /* Current IC log being written to */ + XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */ + XLOG_STATE_SYNCING, /* This IC log is syncing */ + XLOG_STATE_DONE_SYNC, /* Done syncing to disk */ + XLOG_STATE_CALLBACK, /* Callback functions now */ + XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */ + XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ +}; /* - * Flags to log ticket + * Log ticket flags */ -#define XLOG_TIC_INITED 0x1 /* has been initialized */ -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ #define XLOG_TIC_FLAGS \ - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* @@ -179,8 +175,6 @@ typedef struct xlog_ticket { * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. * - ic_size is the full size of the log buffer, minus the cycle headers. - * - ic_io_size is the size of the currently pending log buffer write, which - * might be smaller than ic_size * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. 
@@ -205,9 +199,8 @@ typedef struct xlog_in_core { struct xlog_in_core *ic_prev; struct xlog *ic_log; u32 ic_size; - u32 ic_io_size; u32 ic_offset; - unsigned short ic_state; + enum xlog_iclog_state ic_state; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ @@ -247,6 +240,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ + wait_queue_head_t push_wait; /* background push throttle */ struct work_struct discard_endio_work; }; @@ -323,13 +317,53 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. 
+ * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. + * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yield the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. 
*/ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) + +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines @@ -399,8 +433,6 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; - /* log record crc error injection factor */ - uint32_t l_badcrc_factor; #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; @@ -409,7 +441,8 @@ struct xlog { #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) +#define XLOG_FORCED_SHUTDOWN(log) \ + (unlikely((log)->l_flags & XLOG_IO_ERROR)) /* common routines */ extern int @@ -445,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int -xlog_write( - struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags); +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, uint flags, + bool need_start_rec); +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, + struct xlog_in_core **iclog, xfs_lsn_t *lsn); +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -532,17 +565,15 @@ xlog_cil_force(struct xlog *log) } /* - * Unmount record type is used as a pseudo 
transaction type for the ticket. - * It's value must be outside the range of XFS_TRANS_* values. - */ -#define XLOG_UNMOUNT_REC_TYPE (-1U) - -/* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. */ -static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +static inline void +xlog_wait( + struct wait_queue_head *wq, + struct spinlock *lock) + __releases(lock) { DECLARE_WAITQUEUE(wait, current); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c1a514ffff55..11c3502b07b1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -103,10 +103,9 @@ xlog_alloc_buffer( * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. */ - if (!xlog_verify_bno(log, 0, nbblks)) { + if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } @@ -152,11 +151,10 @@ xlog_do_io( { int error; - if (!xlog_verify_bno(log, blk_no, nbblks)) { + if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) { xfs_warn(log->l_mp, "Invalid log block/length (0x%llx, 0x%x) for buffer", blk_no, nbblks); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -244,19 +242,17 @@ xlog_header_check_recover( * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. 
*/ - if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) { xfs_warn(mp, "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_recover(1)", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; - } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + } + if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, + &head->h_fs_uuid))) { xfs_warn(mp, "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_recover(2)", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; } return 0; @@ -279,11 +275,10 @@ xlog_header_check_mount( * by IRIX and continue. */ xfs_warn(mp, "null uuid in log - IRIX style log"); - } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid, + &head->h_fs_uuid))) { xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); - XFS_ERROR_REPORT("xlog_header_check_mount", - XFS_ERRLEVEL_HIGH, mp); return -EFSCORRUPTED; } return 0; @@ -299,7 +294,7 @@ xlog_recover_iodone( * this during recovery. One strike! 
*/ if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); } } @@ -471,7 +466,7 @@ xlog_find_verify_log_record( xfs_warn(log->l_mp, "Log inconsistent (didn't find previous header)"); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out; } @@ -1347,10 +1342,11 @@ xlog_find_tail( error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer, &rhead_blk, &rhead, &wrapped); if (error < 0) - return error; + goto done; if (!error) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; + error = -EFSCORRUPTED; + goto done; } *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); @@ -1699,11 +1695,10 @@ xlog_clear_stale_blocks( * the distance from the beginning of the log to the * tail. */ - if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { - XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + head_block < tail_block || + head_block >= log->l_logBBsize)) return -EFSCORRUPTED; - } tail_distance = tail_block + (log->l_logBBsize - head_block); } else { /* @@ -1711,11 +1706,10 @@ xlog_clear_stale_blocks( * so the distance from the head to the tail is just * the tail block minus the head block. */ - if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ - XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + head_block >= tail_block || + head_cycle != tail_cycle + 1)) return -EFSCORRUPTED; - } tail_distance = tail_block - head_block; } @@ -1940,6 +1934,12 @@ xlog_recover_buffer_pass1( struct list_head *bucket; struct xfs_buf_cancel *bcp; + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + /* * If this isn't a cancel buffer item, then just return. 
*/ @@ -2135,13 +2135,11 @@ xlog_recover_do_inode_buffer( */ logged_nextp = item->ri_buf[item_index].i_addr + next_unlinked_offset - reg_buf_offset; - if (unlikely(*logged_nextp == 0)) { + if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { xfs_alert(mp, "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); - XFS_ERROR_REPORT("xlog_recover_do_inode_buf", - XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } @@ -2576,6 +2574,7 @@ xlog_recover_do_reg_buffer( int bit; int nbits; xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); @@ -2618,7 +2617,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[i].i_len < size_disk_dquot) { xfs_alert(mp, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2746,15 +2745,10 @@ xlog_recover_buffer_pass2( if (buf_f->blf_flags & XFS_BLF_INODE_BUF) buf_flags |= XBF_UNMAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, NULL); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - goto out_release; - } + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; /* * Recover the buffer only if we get an LSN from it and it's less than @@ -2875,8 +2869,8 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ + ASSERT(dip->di_version >= 3); xfs_inode_from_disk(ip, dip); - ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); if (error) @@ -2951,17 +2945,10 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - &xfs_inode_buf_ops); - if (!bp) { - error = 
-ENOMEM; + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) goto error; - } - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - goto out_release; - } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2969,22 +2956,18 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { + if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", - XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_release; } ldip = item->ri_buf[1].i_addr; - if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { + if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", __func__, item, in_f->ilf_ino); - XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", - XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_release; } @@ -3014,7 +2997,7 @@ xlog_recover_inode_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_hascrc(&mp->m_sb) && + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less @@ -3085,7 +3068,7 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - isize = xfs_log_dinode_size(ldip->di_version); + isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip, @@ -3166,7 +3149,7 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid 
flag", __func__); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out_release; } } @@ -3247,12 +3230,12 @@ xlog_recover_dquot_pass2( recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EIO; + return -EFSCORRUPTED; } - if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); - return -EIO; + return -EFSCORRUPTED; } /* @@ -3279,7 +3262,7 @@ xlog_recover_dquot_pass2( if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); - return -EIO; + return -EFSCORRUPTED; } ASSERT(dq_f->qlf_len == 1); @@ -3537,6 +3520,7 @@ xfs_cui_copy_format( memcpy(dst_cui_fmt, src_cui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3601,8 +3585,10 @@ xlog_recover_cud_pass2( struct xfs_ail *ailp = log->l_ailp; cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } cui_id = cud_formatp->cud_cui_id; /* @@ -3654,6 +3640,7 @@ xfs_bui_copy_format( memcpy(dst_bui_fmt, src_bui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3677,8 +3664,10 @@ xlog_recover_bui_pass2( bui_formatp = item->ri_buf[0].i_addr; - if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } buip = xfs_bui_init(mp); error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); if (error) { @@ -3720,8 +3709,10 @@ xlog_recover_bud_pass2( struct xfs_ail *ailp = log->l_ailp; bud_formatp = item->ri_buf[0].i_addr; - if 
(item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } bui_id = bud_formatp->bud_bui_id; /* @@ -4018,7 +4009,7 @@ xlog_recover_commit_pass1( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4066,7 +4057,7 @@ xlog_recover_commit_pass2( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4187,7 +4178,7 @@ xlog_recover_add_to_cont_trans( ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); - return -EIO; + return -EFSCORRUPTED; } xlog_recover_add_item(&trans->r_itemq); @@ -4243,13 +4234,13 @@ xlog_recover_add_to_trans( xfs_warn(log->l_mp, "%s: bad header magic number", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4285,7 +4276,7 @@ xlog_recover_add_to_trans( in_f->ilf_size); ASSERT(0); kmem_free(ptr); - return -EIO; + return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; @@ -4293,7 +4284,16 @@ xlog_recover_add_to_trans( kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 0); } - ASSERT(item->ri_total > item->ri_cnt); + + if (item->ri_total <= item->ri_cnt) { + xfs_warn(log->l_mp, + "log item region count (%d) overflowed size (%d)", + item->ri_cnt, item->ri_total); + ASSERT(0); + kmem_free(ptr); + return -EFSCORRUPTED; + } + /* Description region is ri_buf[0] */ item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; @@ -4380,7 +4380,7 @@ xlog_recovery_process_trans( default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); - 
error = -EIO; + error = -EFSCORRUPTED; break; } if (error || freeit) @@ -4460,7 +4460,7 @@ xlog_recover_process_ophdr( xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4470,7 +4470,7 @@ xlog_recover_process_ophdr( if (dp + len > end) { xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); WARN_ON(1); - return -EIO; + return -EFSCORRUPTED; } trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); @@ -4947,7 +4947,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_abort; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); @@ -5083,7 +5083,7 @@ xlog_recover_process_iunlinks( * buffer reference though, so that it stays pinned in memory * while we need the buffer. */ - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { @@ -5172,8 +5172,10 @@ xlog_recover_process( * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. 
*/ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } } xlog_unpack_data(rhead, dp, log); @@ -5190,31 +5192,25 @@ xlog_valid_rec_header( { int hlen; - if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { - XFS_ERROR_REPORT("xlog_valid_rec_header(1)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) return -EFSCORRUPTED; - } - if (unlikely( - (!rhead->h_version || - (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + if (XFS_IS_CORRUPT(log->l_mp, + (!rhead->h_version || + (be32_to_cpu(rhead->h_version) & + (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); - return -EIO; + return -EFSCORRUPTED; } /* LR body must have data or it wouldn't have been written */ hlen = be32_to_cpu(rhead->h_len); - if (unlikely( hlen <= 0 || hlen > INT_MAX )) { - XFS_ERROR_REPORT("xlog_valid_rec_header(2)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX)) return -EFSCORRUPTED; - } - if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { - XFS_ERROR_REPORT("xlog_valid_rec_header(3)", - XFS_ERRLEVEL_LOW, log->l_mp); + if (XFS_IS_CORRUPT(log->l_mp, + blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; - } return 0; } @@ -5296,8 +5292,12 @@ xlog_do_recovery_pass( "invalid iclog size (%d bytes), using lsunit (%d bytes)", h_size, log->l_mp->m_logbsize); h_size = log->l_mp->m_logbsize; - } else - return -EFSCORRUPTED; + } else { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + log->l_mp); + error = -EFSCORRUPTED; + goto bread_err1; + } } if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && @@ -5627,7 +5627,7 @@ xlog_do_recover( error = xfs_buf_submit(bp); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { - 
xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } xfs_buf_relse(bp); @@ -5636,7 +5636,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ @@ -5809,7 +5809,6 @@ xlog_recover_check_summary( struct xlog *log) { xfs_mount_t *mp; - xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; @@ -5829,7 +5828,8 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agf read failed agno %d error %d", __func__, agno, error); } else { - agfp = XFS_BUF_TO_AGF(agfbp); + struct xfs_agf *agfp = agfbp->b_addr; + freeblks += be32_to_cpu(agfp->agf_freeblks) + be32_to_cpu(agfp->agf_flcount); xfs_buf_relse(agfbp); @@ -5840,7 +5840,7 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agi read failed agno %d error %d", __func__, agno, error); } else { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; itotal += be32_to_cpu(agi->agi_count); ifree += be32_to_cpu(agi->agi_freecount); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9804efe525a9..e0f9d3b6abe9 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -20,8 +20,8 @@ __xfs_printk( const struct xfs_mount *mp, struct va_format *vaf) { - if (mp && mp->m_fsname) { - printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + if (mp && mp->m_super) { + printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf); return; } printk("%sXFS: %pV\n", level, vaf); @@ -86,17 +86,25 @@ xfs_alert_tag( } void -asswarn(char *expr, char *file, int line) +asswarn( + struct xfs_mount *mp, + char *expr, + char *file, + int line) { - xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d", expr, file, line); WARN_ON(1); } void -assfail(char *expr, char *file, int line) +assfail( + struct 
xfs_mount *mp, + char *expr, + char *file, + int line) { - xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d", expr, file, line); if (xfs_globals.bug_on_assert) BUG(); @@ -105,7 +113,7 @@ assfail(char *expr, char *file, int line) } void -xfs_hex_dump(void *p, int length) +xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 34447dca97d1..0b05e10995a0 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -57,9 +57,9 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) -extern void assfail(char *expr, char *f, int l); -extern void asswarn(char *expr, char *f, int l); +void assfail(struct xfs_mount *mp, char *expr, char *f, int l); +void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); -extern void xfs_hex_dump(void *p, int length); +extern void xfs_hex_dump(const void *p, int length); #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ba5b6f3b2b88..c5513e5a226a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -310,7 +310,7 @@ reread: /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try @@ -360,108 +360,122 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. 
+ * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. 
*/ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } - } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; } return 0; } -/* - * Set the default minimum read and write sizes unless - * already specified in a mount option. 
- * We use smaller I/O sizes when the file system - * is being used for NFS service (wsync mount option). - */ -STATIC void -xfs_set_rw_sizes(xfs_mount_t *mp) +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - int readio_log, writeio_log; + struct xfs_sb *sbp = &mp->m_sb; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { - if (mp->m_flags & XFS_MOUNT_WSYNC) { - readio_log = XFS_WSYNC_READIO_LOG; - writeio_log = XFS_WSYNC_WRITEIO_LOG; - } else { - readio_log = XFS_READIO_LOG_LARGE; - writeio_log = XFS_WRITEIO_LOG_LARGE; - } - } else { - readio_log = mp->m_readio_log; - writeio_log = mp->m_writeio_log; - } + if (mp->m_dalign) { + bool update_sb; + int error; - if (sbp->sb_blocklog > readio_log) { - mp->m_readio_log = sbp->sb_blocklog; - } else { - mp->m_readio_log = readio_log; - } - mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); - if (sbp->sb_blocklog > writeio_log) { - mp->m_writeio_log = sbp->sb_blocklog; - } else { - mp->m_writeio_log = writeio_log; + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + xfs_sb_version_hasdalign(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } - mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); + + return 0; } /* @@ -687,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. 
+ * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -703,10 +717,22 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); if (error) goto out; @@ -728,9 +754,12 @@ xfs_mountfs( goto out_remove_errortag; /* - * Set the minimum read and write sizes + * Update the preferred write size based on the information from the + * on-disk superblock. 
*/ - xfs_set_rw_sizes(mp); + mp->m_allocsize_log = + max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); + mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); /* set the low space thresholds for dynamic preallocation */ xfs_set_low_space_thresholds(mp); @@ -796,9 +825,8 @@ xfs_mountfs( goto out_free_dir; } - if (!sbp->sb_logblocks) { + if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); - XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_free_perag; } @@ -836,12 +864,10 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) { + if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); - XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, - mp); error = -EFSCORRUPTED; goto out_rele_rip; } @@ -1277,7 +1303,7 @@ xfs_mod_fdblocks( printk_once(KERN_WARNING "Filesystem \"%s\": reserve blocks depleted! " "Consider increasing reserve pool size.", - mp->m_fsname); + mp->m_super->s_id); fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fdb60e09a9c5..b2e4598fdf7d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -9,10 +9,8 @@ struct xlog; struct xfs_inode; struct xfs_mru_cache; -struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; -struct xfs_dir_ops; struct xfs_da_geometry; /* dynamic preallocation free space thresholds, 5% down to 1% */ @@ -59,7 +57,6 @@ struct xfs_error_cfg { typedef struct xfs_mount { struct super_block *m_super; - xfs_tid_t m_tid; /* next unused tid for fs */ /* * Bitsets of per-fs metadata that have been checked and/or are sick. 
@@ -89,8 +86,6 @@ typedef struct xfs_mount { struct percpu_counter m_delalloc_blks; struct xfs_buf *m_sb_bp; /* buffer for superblock */ - char *m_fsname; /* filesystem name */ - int m_fsname_len; /* strlen of fs name */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ int m_bsize; /* fs logical block size */ @@ -98,10 +93,8 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_readio_log; /* min read size log bytes */ - uint m_readio_blocks; /* min read size blocks */ - uint m_writeio_log; /* min write size log bytes */ - uint m_writeio_blocks; /* min write size blocks */ + uint m_allocsize_log;/* min write size log bytes */ + uint m_allocsize_blocks; /* min write size blocks */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ @@ -159,10 +152,6 @@ typedef struct xfs_mount { int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ - const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ - const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ - uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ @@ -179,6 +168,11 @@ typedef struct xfs_mount { struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + /* + * Workqueue item so that we can coalesce multiple inode flush attempts + * into a single flush. 
+ */ + struct work_struct m_flush_inodes_work; struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; @@ -229,7 +223,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ +#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */ #define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ #define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ @@ -238,7 +232,7 @@ typedef struct xfs_mount { * allocation */ #define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ -#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred +#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred * I/O size in stat() */ #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ @@ -246,13 +240,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ - -/* - * Default minimum read and write sizes. - */ -#define XFS_READIO_LOG_LARGE 16 -#define XFS_WRITEIO_LOG_LARGE 16 - /* * Max and min values for mount-option defined I/O * preallocation sizes. @@ -260,37 +247,6 @@ typedef struct xfs_mount { #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -/* - * Synchronous read and write sizes. This should be - * better for NFSv2 wsync filesystems. - */ -#define XFS_WSYNC_READIO_LOG 15 /* 32k */ -#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ - -/* - * Allow large block sizes to be reported to userspace programs if the - * "largeio" mount option is used. 
- * - * If compatibility mode is specified, simply return the basic unit of caching - * so that we don't get inefficient read/modify/write I/O from user apps. - * Otherwise.... - * - * If the underlying volume is a stripe, then return the stripe width in bytes - * as the recommended I/O size. It is not a stripe and we've set a default - * buffered I/O size, return that, otherwise return the compat default. - */ -static inline unsigned long -xfs_preferred_iosize(xfs_mount_t *mp) -{ - if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE) - return PAGE_SIZE; - return (mp->m_swidth ? - (mp->m_swidth << mp->m_sb.sb_blocklog) : - ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? - (1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) : - PAGE_SIZE)); -} - #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index b6701b4f59a9..5f04d8a5ab2a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index a339bd5fa260..bb3008d390aa 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -12,6 +12,7 @@ #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_iomap.h" +#include "xfs_pnfs.h" /* * Ensure that we do not have any outstanding pNFS layouts that can be used by @@ -59,7 +60,7 @@ xfs_fs_get_uuid( printk_once(KERN_NOTICE "XFS (%s): using experimental pNFS feature, use at your own risk!\n", - mp->m_fsname); + mp->m_super->s_id); if (*len < sizeof(uuid_t)) return -EINVAL; @@ -142,43 +143,38 @@ xfs_fs_map_blocks( lock_flags = 
xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); - xfs_iunlock(ip, lock_flags); - if (error) - goto out_unlock; + ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); + + if (!error && write && + (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { + if (offset + length > XFS_ISIZE(ip)) + end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); + else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + end_fsb = min(end_fsb, imap.br_startoff + + imap.br_blockcount); + xfs_iunlock(ip, lock_flags); + + error = xfs_iomap_write_direct(ip, offset_fsb, + end_fsb - offset_fsb, &imap); + if (error) + goto out_unlock; - if (write) { - enum xfs_prealloc_flags flags = 0; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - - if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { - /* - * xfs_iomap_write_direct() expects to take ownership of - * the shared ilock. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_iomap_write_direct(ip, offset, length, - &imap, nimaps); - if (error) - goto out_unlock; - - /* - * Ensure the next transaction is committed - * synchronously so that the blocks allocated and - * handed out to the client are guaranteed to be - * present even after a server crash. - */ - flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; - } - - error = xfs_update_prealloc_flags(ip, flags); + /* + * Ensure the next transaction is committed synchronously so + * that the blocks allocated and handed out to the client are + * guaranteed to be present even after a server crash. 
+ */ + error = xfs_update_prealloc_flags(ip, + XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); if (error) goto out_unlock; + } else { + xfs_iunlock(ip, lock_flags); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ecd8ce152ab1..c225691fad15 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -22,6 +22,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_error.h" /* * The global quota manager. There is only one of these for the entire @@ -29,10 +30,10 @@ * quota functionality, including maintaining the freelist and hash * tables of dquots. */ -STATIC int xfs_qm_init_quotainos(xfs_mount_t *); -STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); +STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp); +STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp); -STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); +STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -120,12 +121,11 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; dqp->dq_flags |= XFS_DQ_FREEING; @@ -138,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. 
We need @@ -148,6 +147,8 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + goto out_unlock; } xfs_dqflock(dqp); } @@ -173,6 +174,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* @@ -243,14 +248,14 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( - xfs_inode_t *ip, - xfs_dqid_t id, - uint type, - bool doalloc, - xfs_dquot_t **IO_idqpp) + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + bool doalloc, + struct xfs_dquot **IO_idqpp) { - xfs_dquot_t *dqp; - int error; + struct xfs_dquot *dqp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; @@ -325,23 +330,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), + XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), + XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -539,12 +544,12 @@ xfs_qm_shrink_count( STATIC void xfs_qm_set_defquota( - xfs_mount_t *mp, - uint type, - xfs_quotainfo_t *qinf) + struct xfs_mount *mp, + uint type, + struct xfs_quotainfo *qinf) { - xfs_dquot_t *dqp; - struct xfs_def_quota *defq; + struct xfs_dquot *dqp; + struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; int error; @@ -642,7 +647,7 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - 
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0); + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); error = list_lru_init(&qinf->qi_lru); if (error) @@ -709,9 +714,9 @@ out_free_qinf: */ void xfs_qm_destroy_quotainfo( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_quotainfo_t *qi; + struct xfs_quotainfo *qi; qi = mp->m_quotainfo; ASSERT(qi != NULL); @@ -754,11 +759,15 @@ xfs_qm_qino_alloc( if ((flags & XFS_QMOPT_PQUOTA) && (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; - ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + if (XFS_IS_CORRUPT(mp, + mp->m_sb.sb_pquotino != NULLFSINO)) + return -EFSCORRUPTED; } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; - ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + if (XFS_IS_CORRUPT(mp, + mp->m_sb.sb_gquotino != NULLFSINO)) + return -EFSCORRUPTED; } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ip); @@ -866,12 +875,20 @@ xfs_qm_reset_dqcounts( ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; + + /* + * dquot id 0 stores the default grace period and the maximum + * warning limit that were set by the administrator, so we + * should not reset them. 
+ */ + if (ddq->d_id != 0) { + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + } if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], @@ -1559,7 +1576,7 @@ error_rele: STATIC void xfs_qm_destroy_quotainos( - xfs_quotainfo_t *qi) + struct xfs_quotainfo *qi) { if (qi->qi_uquotaip) { xfs_irele(qi->qi_uquotaip); @@ -1608,8 +1625,8 @@ xfs_qm_dqfree_one( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - xfs_dqid_t uid, - xfs_dqid_t gid, + kuid_t uid, + kgid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1617,6 +1634,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + struct user_namespace *user_ns = inode->i_sb->s_user_ns; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; @@ -1630,7 +1649,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) - gid = ip->i_d.di_gid; + gid = inode->i_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation @@ -1645,7 +1664,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { + if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode @@ -1656,7 +1675,8 @@ xfs_qm_vop_dqalloc( * holding ilock. 
*/ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); + error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), + XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1677,9 +1697,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { + if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); + error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), + XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1693,7 +1714,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { - if (xfs_get_projid(ip) != prid) { + if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, true, &pq); @@ -1737,14 +1758,14 @@ error_rele: * Actually transfer ownership, and do dquot modifications. * These were already reserved. */ -xfs_dquot_t * +struct xfs_dquot * xfs_qm_vop_chown( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t **IO_olddq, - xfs_dquot_t *newdq) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot **IO_olddq, + struct xfs_dquot *newdq) { - xfs_dquot_t *prevdq; + struct xfs_dquot *prevdq; uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -1805,7 +1826,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1818,7 +1839,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1827,7 +1848,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { @@ -1915,20 +1936,21 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index b41b75089548..4e57edca8bce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -54,7 +54,7 @@ struct xfs_def_quota { * 
Various quota information for individual filesystems. * The mount structure keeps a pointer to this. */ -typedef struct xfs_quotainfo { +struct xfs_quotainfo { struct radix_tree_root qi_uquota_tree; struct radix_tree_root qi_gquota_tree; struct radix_tree_root qi_pquota_tree; @@ -64,9 +64,9 @@ typedef struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct list_lru qi_lru; int qi_dquots; - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ + time64_t qi_btimelimit; /* limit for blks timer */ + time64_t qi_itimelimit; /* limit for inodes timer */ + time64_t qi_rtbtimelimit;/* limit for rt blks timer */ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ @@ -76,8 +76,8 @@ typedef struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; -} xfs_quotainfo_t; + struct shrinker qi_shrinker; +}; static inline struct radix_tree_root * xfs_dquot_tree( diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 5d72e88598b4..fc2fa418919f 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -54,13 +54,13 @@ xfs_fill_statvfs_from_dquot( */ void xfs_qm_statvfs( - xfs_inode_t *ip, + struct xfs_inode *ip, struct kstatfs *statp) { - xfs_mount_t *mp = ip->i_mount; - xfs_dquot_t *dqp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index da7ad0383037..5d5ac65aa1cc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ 
b/fs/xfs/xfs_qm_syscalls.c @@ -19,9 +19,71 @@ #include "xfs_qm.h" #include "xfs_icache.h" -STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); -STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, - uint); +STATIC int +xfs_qm_log_quotaoff( + struct xfs_mount *mp, + struct xfs_qoff_logitem **qoffstartp, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) + goto out; + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_log_sb(tp); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + + *qoffstartp = qoffi; +out: + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + struct xfs_mount *mp, + struct xfs_qoff_logitem **startqoff, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) + return error; + + qoffi = xfs_trans_get_qoff_item(tp, *startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + *startqoff = NULL; + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. 
+ */ + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -40,7 +102,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - xfs_qoff_logitem_t *qoffstart; + struct xfs_qoff_logitem *qoffstart = NULL; /* * No file system can have quotas enabled on disk but not in core. @@ -165,7 +227,7 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -198,6 +260,8 @@ xfs_qm_scall_quotaoff( } out_unlock: + if (error && qoffstart) + xfs_qm_qoff_logitem_relse(qoffstart); mutex_unlock(&q->qi_quotaofflock); return error; } @@ -538,74 +602,6 @@ out_unlock: return error; } -STATIC int -xfs_qm_log_quotaoff_end( - xfs_mount_t *mp, - xfs_qoff_logitem_t *startqoff, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. 
- */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - - -STATIC int -xfs_qm_log_quotaoff( - xfs_mount_t *mp, - xfs_qoff_logitem_t **qoffstartp, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - *qoffstartp = NULL; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - /* Fill out the quota context. */ static void xfs_qm_scall_getquota_fill_qc( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index efe42ae7a2f3..aa8fc1f55fbd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, @@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, +xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, prid_t prid, uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, struct 
xfs_dquot **pdqp) { diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index cd6c7210a373..38669e827206 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,9 +37,9 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = q->qi_btimelimit; - tstate->ino_timelimit = q->qi_itimelimit; - tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_timelimit = (u32)q->qi_btimelimit; + tstate->ino_timelimit = (u32)q->qi_itimelimit; + tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; tstate->spc_warnlimit = q->qi_bwarnlimit; tstate->ino_warnlimit = q->qi_iwarnlimit; tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; @@ -201,6 +201,9 @@ xfs_fs_rm_xquota( if (XFS_IS_QUOTA_ON(mp)) return -EINVAL; + if (uflags & ~(FS_USER_QUOTA | FS_GROUP_QUOTA | FS_PROJ_QUOTA)) + return -EINVAL; + if (uflags & FS_USER_QUOTA) flags |= XFS_DQ_USER; if (uflags & FS_GROUP_QUOTA) diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2328268e6245..8eeed73928cd 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -17,7 +17,7 @@ #include "xfs_refcount_item.h" #include "xfs_log.h" #include "xfs_refcount.h" - +#include "xfs_error.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; @@ -34,7 +34,7 @@ xfs_cui_item_free( if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else - kmem_zone_free(xfs_cui_zone, cuip); + kmem_cache_free(xfs_cui_zone, cuip); } /* @@ -206,7 +206,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); - kmem_zone_free(xfs_cud_zone, cudp); + kmem_cache_free(xfs_cud_zone, cudp); } static const struct xfs_item_ops xfs_cud_item_ops = { @@ -497,7 +497,7 @@ xfs_cui_recover( */ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); - return -EIO; + return -EFSCORRUPTED; } } @@ -536,6 +536,7 @@ xfs_cui_recover( type = refc_type; 
break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto abort_error; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0f08153b4994..107bf2a2f344 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -143,8 +143,6 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; - if (!agbp) - return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); @@ -223,8 +221,8 @@ xfs_reflink_trim_around_shared( } } -bool -xfs_inode_need_cow( +int +xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) @@ -308,13 +306,13 @@ static int xfs_find_trim_cow_extent( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, bool *found) { xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_iext_cursor icur; - struct xfs_bmbt_irec got; *found = false; @@ -322,23 +320,22 @@ xfs_find_trim_cow_extent( * If we don't find an overlapping extent, trim the range we need to * allocate to fit the hole we found. 
*/ - if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) - got.br_startoff = offset_fsb + count_fsb; - if (got.br_startoff > offset_fsb) { + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap)) + cmap->br_startoff = offset_fsb + count_fsb; + if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, - got.br_startoff - imap->br_startoff); - return xfs_inode_need_cow(ip, imap, shared); + cmap->br_startoff - imap->br_startoff); + return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; - if (isnullstartblock(got.br_startblock)) { - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + if (isnullstartblock(cmap->br_startblock)) { + xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount); return 0; } /* real extent found - no need to allocate */ - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; + xfs_trim_extent(cmap, offset_fsb, count_fsb); *found = true; return 0; } @@ -348,6 +345,7 @@ int xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now) @@ -367,7 +365,7 @@ xfs_reflink_allocate_cow( xfs_ifork_init_cow(ip); } - error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) return error; if (found) @@ -392,7 +390,7 @@ xfs_reflink_allocate_cow( /* * Check for an overlapping extent again now that we dropped the ilock. */ - error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; if (found) { @@ -410,8 +408,8 @@ xfs_reflink_allocate_cow( /* Allocate the entire reservation as unwritten blocks. 
*/ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, - resblks, imap, &nimaps); + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, + &nimaps); if (error) goto out_unreserve; @@ -427,15 +425,15 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - xfs_trim_extent(imap, offset_fsb, count_fsb); + xfs_trim_extent(cmap, offset_fsb, count_fsb); /* * COW fork extents are supposed to remain unwritten until we're ready * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. */ - if (!convert_now || imap->br_state == XFS_EXT_NORM) + if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; - trace_xfs_reflink_convert_cow(ip, imap); + trace_xfs_reflink_convert_cow(ip, cmap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: @@ -1053,6 +1051,7 @@ xfs_reflink_remap_extent( uirec.br_startblock = irec->br_startblock + rlen; uirec.br_startoff = irec->br_startoff + rlen; uirec.br_blockcount = unmap_len - rlen; + uirec.br_state = irec->br_state; unmap_len = rlen; /* If this isn't a real mapping, we're done. */ @@ -1270,7 +1269,7 @@ xfs_reflink_zero_posteof( trace_xfs_zero_eof(ip, isize, pos - isize); return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, - &xfs_iomap_ops); + &xfs_buffered_write_iomap_ops); } /* @@ -1381,85 +1380,6 @@ out_unlock: return ret; } -/* - * The user wants to preemptively CoW all shared blocks in this file, - * which enables us to turn off the reflink flag. Iterate all - * extents which are not prealloc/delalloc to see which ranges are - * mentioned in the refcount tree, then read those blocks into the - * pagecache, dirty them, fsync them back out, and then we can update - * the inode flag. What happens if we run out of memory? 
:) - */ -STATIC int -xfs_reflink_dirty_extents( - struct xfs_inode *ip, - xfs_fileoff_t fbno, - xfs_filblks_t end, - xfs_off_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; - xfs_off_t fpos; - xfs_off_t flen; - struct xfs_bmbt_irec map[2]; - int nmaps; - int error = 0; - - while (end - fbno > 0) { - nmaps = 1; - /* - * Look for extents in the file. Skip holes, delalloc, or - * unwritten extents; they can't be reflinked. - */ - error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); - if (error) - goto out; - if (nmaps == 0) - break; - if (!xfs_bmap_is_real_extent(&map[0])) - goto next; - - map[1] = map[0]; - while (map[1].br_blockcount) { - agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); - aglen = map[1].br_blockcount; - - error = xfs_reflink_find_shared(mp, NULL, agno, agbno, - aglen, &rbno, &rlen, true); - if (error) - goto out; - if (rbno == NULLAGBLOCK) - break; - - /* Dirty the pages */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + - (rbno - agbno)); - flen = XFS_FSB_TO_B(mp, rlen); - if (fpos + flen > isize) - flen = isize - fpos; - error = iomap_file_dirty(VFS_I(ip), fpos, flen, - &xfs_iomap_ops); - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - - map[1].br_blockcount -= (rbno - agbno + rlen); - map[1].br_startoff += (rbno - agbno + rlen); - map[1].br_startblock += (rbno - agbno + rlen); - } - -next: - fbno = map[0].br_startoff + map[0].br_blockcount; - } -out: - return error; -} - /* Does this inode need the reflink flag? */ int xfs_reflink_inode_has_shared_extents( @@ -1536,7 +1456,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. 
*/ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; @@ -1596,10 +1517,7 @@ xfs_reflink_unshare( xfs_off_t offset, xfs_off_t len) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t fbno; - xfs_filblks_t end; - xfs_off_t isize; + struct inode *inode = VFS_I(ip); int error; if (!xfs_is_reflink_inode(ip)) @@ -1607,20 +1525,13 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare(ip, offset, len); - inode_dio_wait(VFS_I(ip)); + inode_dio_wait(inode); - /* Try to CoW the selected ranges */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - fbno = XFS_B_TO_FSBT(mp, offset); - isize = i_size_read(VFS_I(ip)); - end = XFS_B_TO_FSB(mp, offset + len); - error = xfs_reflink_dirty_extents(ip, fbno, end, isize); + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) - goto out_unlock; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* Wait for the IO to finish */ - error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + goto out; + error = filemap_write_and_wait(inode->i_mapping); if (error) goto out; @@ -1628,11 +1539,8 @@ xfs_reflink_unshare( error = xfs_reflink_try_clear_inode_flag(ip); if (error) goto out; - return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 28a43b7f581d..3e4fd46373ab 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -22,11 +22,11 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, +int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); -extern int 
xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, +int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 8939e0ea09cd..4911b68f95dd 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -17,7 +17,7 @@ #include "xfs_rmap_item.h" #include "xfs_log.h" #include "xfs_rmap.h" - +#include "xfs_error.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; @@ -34,7 +34,7 @@ xfs_rui_item_free( if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else - kmem_zone_free(xfs_rui_zone, ruip); + kmem_cache_free(xfs_rui_zone, ruip); } /* @@ -171,8 +171,10 @@ xfs_rui_copy_format( src_rui_fmt = buf->i_addr; len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - if (buf->i_len != len) + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; + } memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; @@ -227,7 +229,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); - kmem_zone_free(xfs_rud_zone, rudp); + kmem_cache_free(xfs_rud_zone, rudp); } static const struct xfs_item_ops xfs_rud_item_ops = { @@ -539,7 +541,7 @@ xfs_rui_recover( */ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); - return -EIO; + return -EFSCORRUPTED; } } @@ -581,6 +583,7 @@ xfs_rui_recover( type = XFS_RMAP_FREE; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); error = -EFSCORRUPTED; goto abort_error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 4a48a8c75b4f..6209e7b6b895 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -792,8 +792,7 @@ xfs_growfs_rt_alloc( */ nmap = 1; error = 
xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_METADATA, resblks, &map, - &nmap); + XFS_BMAPI_METADATA, 0, &map, &nmap); if (!error && nmap < 1) error = -ENOSPC; if (error) @@ -827,12 +826,10 @@ xfs_growfs_rt_alloc( * Get a buffer for the block. */ d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = -EIO; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) goto out_trans_cancel; - } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 113883c4f202..f70f1255220b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -57,13 +57,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) /* Loop over all stats groups */ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - len += snprintf(buf + len, PATH_MAX - len, "%s", + len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - len += snprintf(buf + len, PATH_MAX - len, " %u", + len += scnprintf(buf + len, PATH_MAX - len, " %u", counter_val(stats, j)); - len += snprintf(buf + len, PATH_MAX - len, "\n"); + len += scnprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { @@ -72,9 +72,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } - len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8d1df9f8be07..a123cd8267d9 100644 --- a/fs/xfs/xfs_super.c 
+++ b/fs/xfs/xfs_super.c @@ -37,10 +37,10 @@ #include "xfs_reflink.h" #include <linux/magic.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> static const struct super_operations xfs_super_operations; -struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -51,7 +51,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ * Table driven mount option parser. */ enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, + Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, @@ -59,382 +59,62 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, + Opt_discard, Opt_nodiscard, Opt_dax, }; -static const match_table_t tokens = { - {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */ - {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */ - {Opt_logdev, "logdev=%s"}, /* log device */ - {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */ - {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */ - {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */ - {Opt_noalign, "noalign"}, /* turn off stripe alignment */ - {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */ - {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ - {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ - {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ - {Opt_grpid, "grpid"}, /* group-ID from parent directory */ - {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ - {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ - 
{Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ - {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ - {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ - {Opt_inode32, "inode32"}, /* inode allocation limited to - * XFS_MAXINUMBER_32 */ - {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */ - {Opt_noikeep, "noikeep"}, /* free empty inode clusters */ - {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */ - {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes - * in stat(). */ - {Opt_attr2, "attr2"}, /* do use attr2 attribute format */ - {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */ - {Opt_filestreams,"filestreams"},/* use filestreams allocator */ - {Opt_quota, "quota"}, /* disk quotas (user) */ - {Opt_noquota, "noquota"}, /* no quotas */ - {Opt_usrquota, "usrquota"}, /* user quota enabled */ - {Opt_grpquota, "grpquota"}, /* group quota enabled */ - {Opt_prjquota, "prjquota"}, /* project quota enabled */ - {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */ - {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */ - {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */ - {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */ - {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */ - {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */ - {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */ - {Opt_discard, "discard"}, /* Discard unused blocks */ - {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ - {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ - {Opt_err, NULL}, +static const struct fs_parameter_spec xfs_fs_parameters[] = { + fsparam_u32("logbufs", Opt_logbufs), + fsparam_string("logbsize", Opt_logbsize), + fsparam_string("logdev", Opt_logdev), + fsparam_string("rtdev", Opt_rtdev), + fsparam_flag("wsync", Opt_wsync), + 
fsparam_flag("noalign", Opt_noalign), + fsparam_flag("swalloc", Opt_swalloc), + fsparam_u32("sunit", Opt_sunit), + fsparam_u32("swidth", Opt_swidth), + fsparam_flag("nouuid", Opt_nouuid), + fsparam_flag("grpid", Opt_grpid), + fsparam_flag("nogrpid", Opt_nogrpid), + fsparam_flag("bsdgroups", Opt_bsdgroups), + fsparam_flag("sysvgroups", Opt_sysvgroups), + fsparam_string("allocsize", Opt_allocsize), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag("inode64", Opt_inode64), + fsparam_flag("inode32", Opt_inode32), + fsparam_flag("ikeep", Opt_ikeep), + fsparam_flag("noikeep", Opt_noikeep), + fsparam_flag("largeio", Opt_largeio), + fsparam_flag("nolargeio", Opt_nolargeio), + fsparam_flag("attr2", Opt_attr2), + fsparam_flag("noattr2", Opt_noattr2), + fsparam_flag("filestreams", Opt_filestreams), + fsparam_flag("quota", Opt_quota), + fsparam_flag("noquota", Opt_noquota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_flag("uquota", Opt_uquota), + fsparam_flag("gquota", Opt_gquota), + fsparam_flag("pquota", Opt_pquota), + fsparam_flag("uqnoenforce", Opt_uqnoenforce), + fsparam_flag("gqnoenforce", Opt_gqnoenforce), + fsparam_flag("pqnoenforce", Opt_pqnoenforce), + fsparam_flag("qnoenforce", Opt_qnoenforce), + fsparam_flag("discard", Opt_discard), + fsparam_flag("nodiscard", Opt_nodiscard), + fsparam_flag("dax", Opt_dax), + {} }; - -STATIC int -suffix_kstrtoint(const substring_t *s, unsigned int base, int *res) -{ - int last, shift_left_factor = 0, _res; - char *value; - int ret = 0; - - value = match_strdup(s); - if (!value) - return -ENOMEM; - - last = strlen(value) - 1; - if (value[last] == 'K' || value[last] == 'k') { - shift_left_factor = 10; - value[last] = '\0'; - } - if (value[last] == 'M' || value[last] == 'm') { - shift_left_factor = 20; - value[last] = '\0'; - } - if (value[last] == 'G' || value[last] == 'g') { - shift_left_factor = 30; - value[last] = '\0'; - } - - 
if (kstrtoint(value, base, &_res)) - ret = -EINVAL; - kfree(value); - *res = _res << shift_left_factor; - return ret; -} - -/* - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - * - * Note that this function leaks the various device name allocations on - * failure. The caller takes care of them. - * - * *sb is const because this is also used to test options on the remount - * path, and we don't want this to have any side effects at remount time. - * Today this function does not change *sb, but just to future-proof... - */ -STATIC int -xfs_parseargs( - struct xfs_mount *mp, - char *options) -{ - const struct super_block *sb = mp->m_super; - char *p; - substring_t args[MAX_OPT_ARGS]; - int dsunit = 0; - int dswidth = 0; - int iosize = 0; - uint8_t iosizelog = 0; - - /* - * set up the mount name first so all the errors will refer to the - * correct device. - */ - mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); - if (!mp->m_fsname) - return -ENOMEM; - mp->m_fsname_len = strlen(mp->m_fsname) + 1; - - /* - * Copy binary VFS mount flags we are interested in. - */ - if (sb_rdonly(sb)) - mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & SB_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & SB_SYNCHRONOUS) - mp->m_flags |= XFS_MOUNT_WSYNC; - - /* - * Set some default flags that could be cleared by the mount option - * parsing. - */ - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * These can be overridden by the mount option parsing. 
- */ - mp->m_logbufs = -1; - mp->m_logbsize = -1; - - if (!options) - goto done; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_logbufs: - if (match_int(args, &mp->m_logbufs)) - return -EINVAL; - break; - case Opt_logbsize: - if (suffix_kstrtoint(args, 10, &mp->m_logbsize)) - return -EINVAL; - break; - case Opt_logdev: - kfree(mp->m_logname); - mp->m_logname = match_strdup(args); - if (!mp->m_logname) - return -ENOMEM; - break; - case Opt_rtdev: - kfree(mp->m_rtname); - mp->m_rtname = match_strdup(args); - if (!mp->m_rtname) - return -ENOMEM; - break; - case Opt_allocsize: - case Opt_biosize: - if (suffix_kstrtoint(args, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - break; - case Opt_grpid: - case Opt_bsdgroups: - mp->m_flags |= XFS_MOUNT_GRPID; - break; - case Opt_nogrpid: - case Opt_sysvgroups: - mp->m_flags &= ~XFS_MOUNT_GRPID; - break; - case Opt_wsync: - mp->m_flags |= XFS_MOUNT_WSYNC; - break; - case Opt_norecovery: - mp->m_flags |= XFS_MOUNT_NORECOVERY; - break; - case Opt_noalign: - mp->m_flags |= XFS_MOUNT_NOALIGN; - break; - case Opt_swalloc: - mp->m_flags |= XFS_MOUNT_SWALLOC; - break; - case Opt_sunit: - if (match_int(args, &dsunit)) - return -EINVAL; - break; - case Opt_swidth: - if (match_int(args, &dswidth)) - return -EINVAL; - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - break; - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - break; - case Opt_nouuid: - mp->m_flags |= XFS_MOUNT_NOUUID; - break; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - break; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - break; - case Opt_largeio: - mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_nolargeio: - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - break; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - break; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - 
mp->m_flags |= XFS_MOUNT_NOATTR2; - break; - case Opt_filestreams: - mp->m_flags |= XFS_MOUNT_FILESTREAMS; - break; - case Opt_noquota: - mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; - mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; - break; - case Opt_quota: - case Opt_uquota: - case Opt_usrquota: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); - break; - case Opt_qnoenforce: - case Opt_uqnoenforce: - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_UQUOTA_ENFD; - break; - case Opt_pquota: - case Opt_prjquota: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); - break; - case Opt_pqnoenforce: - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_PQUOTA_ENFD; - break; - case Opt_gquota: - case Opt_grpquota: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); - break; - case Opt_gqnoenforce: - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_GQUOTA_ENFD; - break; - case Opt_discard: - mp->m_flags |= XFS_MOUNT_DISCARD; - break; - case Opt_nodiscard: - mp->m_flags &= ~XFS_MOUNT_DISCARD; - break; -#ifdef CONFIG_FS_DAX - case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; - break; -#endif - default: - xfs_warn(mp, "unknown mount option [%s].", p); - return -EINVAL; - } - } - - /* - * no recovery flag requires a read-only mount - */ - if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, "no-recovery mounts must be read-only."); - return -EINVAL; - } - - if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { - xfs_warn(mp, - "sunit and swidth options incompatible with the noalign option"); - return -EINVAL; - } - -#ifndef CONFIG_XFS_QUOTA - if (XFS_IS_QUOTA_RUNNING(mp)) { - xfs_warn(mp, "quota support not available in this kernel."); - return -EINVAL; - } -#endif - - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - xfs_warn(mp, "sunit 
and swidth must be specified together"); - return -EINVAL; - } - - if (dsunit && (dswidth % dsunit != 0)) { - xfs_warn(mp, - "stripe width (%d) must be a multiple of the stripe unit (%d)", - dswidth, dsunit); - return -EINVAL; - } - -done: - if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. - */ - mp->m_dalign = dsunit; - mp->m_swidth = dswidth; - } - - if (mp->m_logbufs != -1 && - mp->m_logbufs != 0 && - (mp->m_logbufs < XLOG_MIN_ICLOGS || - mp->m_logbufs > XLOG_MAX_ICLOGS)) { - xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", - mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return -EINVAL; - } - if (mp->m_logbsize != -1 && - mp->m_logbsize != 0 && - (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || - mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || - !is_power_of_2(mp->m_logbsize))) { - xfs_warn(mp, - "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - mp->m_logbsize); - return -EINVAL; - } - - if (iosizelog) { - if (iosizelog > XFS_MAX_IO_LOG || - iosizelog < XFS_MIN_IO_LOG) { - xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", - iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return -EINVAL; - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = iosizelog; - mp->m_writeio_log = iosizelog; - } - - return 0; -} - struct proc_xfs_info { uint64_t flag; char *str; }; -STATIC void -xfs_showargs( - struct xfs_mount *mp, - struct seq_file *m) +static int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ @@ -448,30 +128,24 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, ",filestreams" }, { XFS_MOUNT_GRPID, ",grpid" }, { XFS_MOUNT_DISCARD, ",discard" }, - { XFS_MOUNT_SMALL_INUMS, ",inode32" }, + { XFS_MOUNT_LARGEIO, ",largeio" }, { XFS_MOUNT_DAX, ",dax" }, { 0, NULL } 
}; - static struct proc_xfs_info xfs_info_unset[] = { - /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" }, - { XFS_MOUNT_SMALL_INUMS, ",inode64" }, - { 0, NULL } - }; + struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { if (mp->m_flags & xfs_infop->flag) seq_puts(m, xfs_infop->str); } - for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { - if (!(mp->m_flags & xfs_infop->flag)) - seq_puts(m, xfs_infop->str); - } - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, ",inode%d", + (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64); + + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) seq_printf(m, ",allocsize=%dk", - (int)(1 << mp->m_writeio_log) >> 10); + (1 << mp->m_allocsize_log) >> 10); if (mp->m_logbufs > 0) seq_printf(m, ",logbufs=%d", mp->m_logbufs); @@ -510,32 +184,8 @@ xfs_showargs( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); -} -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). 
- */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; + return 0; } /* @@ -655,7 +305,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void @@ -808,33 +458,33 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, - 0, mp->m_fsname); + 0, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); if (!mp->m_eofblocks_workqueue) goto out_destroy_reclaim; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_fsname); + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_eofb; @@ -866,6 +516,20 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_buf_workqueue); } +static void +xfs_flush_inodes_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, struct xfs_mount, + m_flush_inodes_work); + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { 
+ sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting @@ -876,12 +540,15 @@ void xfs_flush_inodes( struct xfs_mount *mp) { - struct super_block *sb = mp->m_super; + /* + * If flush_work() returns true then that means we waited for a flush + * which was already in progress. Don't bother running another scan. + */ + if (flush_work(&mp->m_flush_inodes_work)) + return; - if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb); - up_read(&sb->s_umount); - } + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); + flush_work(&mp->m_flush_inodes_work); } /* Catch misguided souls that try to use this interface on XFS */ @@ -1038,13 +705,13 @@ xfs_fs_drop_inode( return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); } -STATIC void -xfs_free_fsname( +static void +xfs_mount_free( struct xfs_mount *mp) { - kfree(mp->m_fsname); kfree(mp->m_rtname); kfree(mp->m_logname); + kmem_free(mp); } STATIC int @@ -1205,181 +872,6 @@ xfs_quiesce_attr( xfs_log_quiesce(mp); } -STATIC int -xfs_test_remount_options( - struct super_block *sb, - char *options) -{ - int error = 0; - struct xfs_mount *tmp_mp; - - tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL); - if (!tmp_mp) - return -ENOMEM; - - tmp_mp->m_super = sb; - error = xfs_parseargs(tmp_mp, options); - xfs_free_fsname(tmp_mp); - kmem_free(tmp_mp); - - return error; -} - -STATIC int -xfs_fs_remount( - struct super_block *sb, - int *flags, - char *options) -{ - struct xfs_mount *mp = XFS_M(sb); - xfs_sb_t *sbp = &mp->m_sb; - substring_t args[MAX_OPT_ARGS]; - char *p; - int error; - - /* First, check for complete junk; i.e. 
invalid options */ - error = xfs_test_remount_options(sb, options); - if (error) - return error; - - sync_filesystem(sb); - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_inode64: - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - case Opt_inode32: - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); - break; - default: - /* - * Logically we would return an error here to prevent - * users from believing they might have changed - * mount options using remount which can't be changed. - * - * But unfortunately mount(8) adds all options from - * mtab and fstab to the mount arguments in some cases - * so we can't blindly reject options, but have to - * check for each specified option if it actually - * differs from the currently set option and only - * reject it if that's the case. - * - * Until that is implemented we return success for - * every remount request, and silently ignore all - * options that we can't actually change. - */ -#if 0 - xfs_info(mp, - "mount option \"%s\" not supported for remount", p); - return -EINVAL; -#else - break; -#endif - } - } - - /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - xfs_warn(mp, - "ro->rw transition prohibited on norecovery mount"); - return -EINVAL; - } - - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_ro_compat_feature(sbp, - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", - (sbp->sb_features_ro_compat & - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - return -EINVAL; - } - - mp->m_flags &= ~XFS_MOUNT_RDONLY; - - /* - * If this is the first remount to writeable state we - * might have some superblock changes to update. 
- */ - if (mp->m_update_sb) { - error = xfs_sync_sb(mp, false); - if (error) { - xfs_warn(mp, "failed to write sb changes"); - return error; - } - mp->m_update_sb = false; - } - - /* - * Fill out the reserve pool if it is empty. Use the stashed - * value if it is non-zero, otherwise go with the default. - */ - xfs_restore_resvblks(mp); - xfs_log_work_queue(mp); - - /* Recover any CoW blocks that never got remapped. */ - error = xfs_reflink_recover_cow(mp); - if (error) { - xfs_err(mp, - "Error %d recovering leftover CoW allocations.", error); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - xfs_start_block_reaping(mp); - - /* Create the per-AG metadata reservation pool .*/ - error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - return error; - } - - /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. - */ - xfs_stop_block_reaping(mp); - - /* Get rid of any leftover CoW reservations... */ - error = xfs_icache_free_cowblocks(mp, NULL); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - - /* Free the per-AG metadata reservation pool. */ - error = xfs_fs_unreserve_ag_blocks(mp); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } - - /* - * Before we sync the metadata, we need to free up the reserve - * block pool so that the used block count in the superblock on - * disk is correct at the end of the remount. Stash the current - * reserve pool size so that if we get remounted rw, we can - * return it to the same size. - */ - xfs_save_resvblks(mp); - - xfs_quiesce_attr(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; - } - - return 0; -} - /* * Second stage of a freeze. The data is already frozen so we only * need to take care of the metadata. 
Once that's done sync the superblock @@ -1410,15 +902,6 @@ xfs_fs_unfreeze( return 0; } -STATIC int -xfs_fs_show_options( - struct seq_file *m, - struct dentry *root) -{ - xfs_showargs(XFS_M(root->d_sb), m); - return 0; -} - /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. @@ -1541,60 +1024,337 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_delalloc_blks); } -static struct xfs_mount * -xfs_mount_alloc( +static void +xfs_fs_put_super( struct super_block *sb) { - struct xfs_mount *mp; + struct xfs_mount *mp = XFS_M(sb); - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); - if (!mp) - return NULL; + /* if ->fill_super failed, we have no mount to tear down */ + if (!sb->s_fs_info) + return; - mp->m_super = sb; - spin_lock_init(&mp->m_sb_lock); - spin_lock_init(&mp->m_agirotor_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); - spin_lock_init(&mp->m_perag_lock); - mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); - mp->m_kobj.kobject.kset = xfs_kset; - /* - * We don't create the finobt per-ag space reservation until after log - * recovery, so we must set this to true so that an ifree transaction - * started during log recovery will not depend on space reservations - * for finobt expansion. 
- */ - mp->m_finobt_nores = true; - return mp; + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + free_percpu(mp->m_stats.xs_stats); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + + sb->s_fs_info = NULL; + xfs_mount_free(mp); } +static long +xfs_fs_nr_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; + return xfs_reclaim_inodes_count(XFS_M(sb)); +} -STATIC int -xfs_fs_fill_super( +static long +xfs_fs_free_cached_objects( struct super_block *sb, - void *data, - int silent) + struct shrink_control *sc) { - struct inode *root; - struct xfs_mount *mp = NULL; - int flags = 0, error = -ENOMEM; + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, + .drop_inode = xfs_fs_drop_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, +}; + +static int +suffix_kstrtoint( + const char *s, + unsigned int base, + int *res) +{ + int last, shift_left_factor = 0, _res; + char *value; + int ret = 0; + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if 
(kstrtoint(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + +/* + * Set mount state from a mount option. + * + * NOTE: mp->m_super is NULL here! + */ +static int +xfs_fc_parse_param( + struct fs_context *fc, + struct fs_parameter *param) +{ + struct xfs_mount *mp = fc->s_fs_info; + struct fs_parse_result result; + int size = 0; + int opt; + + opt = fs_parse(fc, xfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_logbufs: + mp->m_logbufs = result.uint_32; + return 0; + case Opt_logbsize: + if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize)) + return -EINVAL; + return 0; + case Opt_logdev: + kfree(mp->m_logname); + mp->m_logname = kstrdup(param->string, GFP_KERNEL); + if (!mp->m_logname) + return -ENOMEM; + return 0; + case Opt_rtdev: + kfree(mp->m_rtname); + mp->m_rtname = kstrdup(param->string, GFP_KERNEL); + if (!mp->m_rtname) + return -ENOMEM; + return 0; + case Opt_allocsize: + if (suffix_kstrtoint(param->string, 10, &size)) + return -EINVAL; + mp->m_allocsize_log = ffs(size) - 1; + mp->m_flags |= XFS_MOUNT_ALLOCSIZE; + return 0; + case Opt_grpid: + case Opt_bsdgroups: + mp->m_flags |= XFS_MOUNT_GRPID; + return 0; + case Opt_nogrpid: + case Opt_sysvgroups: + mp->m_flags &= ~XFS_MOUNT_GRPID; + return 0; + case Opt_wsync: + mp->m_flags |= XFS_MOUNT_WSYNC; + return 0; + case Opt_norecovery: + mp->m_flags |= XFS_MOUNT_NORECOVERY; + return 0; + case Opt_noalign: + mp->m_flags |= XFS_MOUNT_NOALIGN; + return 0; + case Opt_swalloc: + mp->m_flags |= XFS_MOUNT_SWALLOC; + return 0; + case Opt_sunit: + mp->m_dalign = result.uint_32; + return 0; + case Opt_swidth: + mp->m_swidth = result.uint_32; + return 0; + case Opt_inode32: + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + return 0; + case Opt_inode64: + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + return 0; + case Opt_nouuid: + mp->m_flags |= XFS_MOUNT_NOUUID; + return 0; + case Opt_ikeep: + mp->m_flags |= XFS_MOUNT_IKEEP; + 
return 0; + case Opt_noikeep: + mp->m_flags &= ~XFS_MOUNT_IKEEP; + return 0; + case Opt_largeio: + mp->m_flags |= XFS_MOUNT_LARGEIO; + return 0; + case Opt_nolargeio: + mp->m_flags &= ~XFS_MOUNT_LARGEIO; + return 0; + case Opt_attr2: + mp->m_flags |= XFS_MOUNT_ATTR2; + return 0; + case Opt_noattr2: + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + return 0; + case Opt_filestreams: + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + return 0; + case Opt_noquota: + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + return 0; + case Opt_quota: + case Opt_uquota: + case Opt_usrquota: + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + return 0; + case Opt_qnoenforce: + case Opt_uqnoenforce: + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + return 0; + case Opt_pquota: + case Opt_prjquota: + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_PQUOTA_ENFD); + return 0; + case Opt_pqnoenforce: + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_PQUOTA_ENFD; + return 0; + case Opt_gquota: + case Opt_grpquota: + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_GQUOTA_ENFD); + return 0; + case Opt_gqnoenforce: + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_GQUOTA_ENFD; + return 0; + case Opt_discard: + mp->m_flags |= XFS_MOUNT_DISCARD; + return 0; + case Opt_nodiscard: + mp->m_flags &= ~XFS_MOUNT_DISCARD; + return 0; +#ifdef CONFIG_FS_DAX + case Opt_dax: + mp->m_flags |= XFS_MOUNT_DAX; + return 0; +#endif + default: + xfs_warn(mp, "unknown mount option [%s].", param->key); + return -EINVAL; + } + + return 0; +} + +static int +xfs_fc_validate_params( + struct xfs_mount *mp) +{ /* - * allocate mp and do all low-level struct initializations before we - * attach it to the super + * no recovery flag requires a read-only mount */ - mp = 
xfs_mount_alloc(sb); - if (!mp) - goto out; - sb->s_fs_info = mp; + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && + (mp->m_dalign || mp->m_swidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return -EINVAL; + } + + if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) { + xfs_warn(mp, "quota support not available in this kernel."); + return -EINVAL; + } + + if ((mp->m_dalign && !mp->m_swidth) || + (!mp->m_dalign && mp->m_swidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return -EINVAL; + } + + if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + mp->m_swidth, mp->m_dalign); + return -EINVAL; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return -EINVAL; + } + + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return -EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + (mp->m_allocsize_log > XFS_MAX_IO_LOG || + mp->m_allocsize_log < XFS_MIN_IO_LOG)) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); + return -EINVAL; + } + + return 0; +} + +static int +xfs_fc_fill_super( + struct super_block *sb, + struct fs_context *fc) +{ + struct xfs_mount *mp = sb->s_fs_info; + struct inode *root; + int flags = 0, error; - error = xfs_parseargs(mp, (char *)data); + mp->m_super = 
sb; + + error = xfs_fc_validate_params(mp); if (error) - goto out_free_fsname; + goto out_free_names; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1616,12 +1376,12 @@ xfs_fs_fill_super( msleep(xfs_globals.mount_delay * 1000); } - if (silent) + if (fc->sb_flags & SB_SILENT) flags |= XFS_MFSI_QUIET; error = xfs_open_devices(mp); if (error) - goto out_free_fsname; + goto out_free_names; error = xfs_init_mount_workqueues(mp); if (error) @@ -1650,6 +1410,26 @@ xfs_fs_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. 
+ */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1661,7 +1441,7 @@ xfs_fs_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN; @@ -1758,11 +1538,9 @@ xfs_fs_fill_super( xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); - out_free_fsname: + out_free_names: sb->s_fs_info = NULL; - xfs_free_fsname(mp); - kfree(mp); - out: + xfs_mount_free(mp); return error; out_unmount: @@ -1771,80 +1549,253 @@ xfs_fs_fill_super( goto out_free_sb; } -STATIC void -xfs_fs_put_super( - struct super_block *sb) +static int +xfs_fc_get_tree( + struct fs_context *fc) { - struct xfs_mount *mp = XFS_M(sb); + return get_tree_bdev(fc, xfs_fc_fill_super); +} - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; +static int +xfs_remount_rw( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + int error; - xfs_notice(mp, "Unmounting Filesystem"); - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } - xfs_freesb(mp); - free_percpu(mp->m_stats.xs_stats); - xfs_destroy_percpu_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, + "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + 
(sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } - sb->s_fs_info = NULL; - xfs_free_fsname(mp); - kfree(mp); + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we might have some + * superblock changes to update. + */ + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_sb = false; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed value if + * it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + xfs_start_block_reaping(mp); + + /* Create the per-AG metadata reservation pool .*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + return error; + + return 0; } -STATIC struct dentry * -xfs_fs_mount( - struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int +xfs_remount_ro( + struct xfs_mount *mp) { - return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); + int error; + + /* + * Cancel background eofb scanning so it cannot race with the final + * log force+buftarg wait and deadlock the remount. + */ + xfs_stop_block_reaping(mp); + + /* Get rid of any leftover CoW reservations... */ + error = xfs_icache_free_cowblocks(mp, NULL); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* Free the per-AG metadata reservation pool. 
*/ + error = xfs_fs_unreserve_ag_blocks(mp); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* + * Before we sync the metadata, we need to free up the reserve block + * pool so that the used block count in the superblock on disk is + * correct at the end of the remount. Stash the current* reserve pool + * size so that if we get remounted rw, we can return it to the same + * size. + */ + xfs_save_resvblks(mp); + + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + + return 0; } -static long -xfs_fs_nr_cached_objects( - struct super_block *sb, - struct shrink_control *sc) +/* + * Logically we would return an error here to prevent users from believing + * they might have changed mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from mtab and fstab to the mount + * arguments in some cases so we can't blindly reject options, but have to + * check for each specified option if it actually differs from the currently + * set option and only reject it if that's the case. + * + * Until that is implemented we return success for every remount request, and + * silently ignore all options that we can't actually change. 
+ */ +static int +xfs_fc_reconfigure( + struct fs_context *fc) { - /* Paranoia: catch incorrect calls during mount setup or teardown */ - if (WARN_ON_ONCE(!sb->s_fs_info)) - return 0; - return xfs_reclaim_inodes_count(XFS_M(sb)); + struct xfs_mount *mp = XFS_M(fc->root->d_sb); + struct xfs_mount *new_mp = fc->s_fs_info; + xfs_sb_t *sbp = &mp->m_sb; + int flags = fc->sb_flags; + int error; + + error = xfs_fc_validate_params(new_mp); + if (error) + return error; + + sync_filesystem(mp->m_super); + + /* inode32 -> inode64 */ + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && + !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + } + + /* inode64 -> inode32 */ + if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) && + (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) { + error = xfs_remount_rw(mp); + if (error) + return error; + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) { + error = xfs_remount_ro(mp); + if (error) + return error; + } + + return 0; } -static long -xfs_fs_free_cached_objects( - struct super_block *sb, - struct shrink_control *sc) +static void xfs_fc_free( + struct fs_context *fc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); + struct xfs_mount *mp = fc->s_fs_info; + + /* + * mp is stored in the fs_context when it is initialized. + * mp is transferred to the superblock on a successful mount, + * but if an error occurs before the transfer we have to free + * it here. 
+ */ + if (mp) + xfs_mount_free(mp); } -static const struct super_operations xfs_super_operations = { - .alloc_inode = xfs_fs_alloc_inode, - .destroy_inode = xfs_fs_destroy_inode, - .dirty_inode = xfs_fs_dirty_inode, - .drop_inode = xfs_fs_drop_inode, - .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_fs, - .freeze_fs = xfs_fs_freeze, - .unfreeze_fs = xfs_fs_unfreeze, - .statfs = xfs_fs_statfs, - .remount_fs = xfs_fs_remount, - .show_options = xfs_fs_show_options, - .nr_cached_objects = xfs_fs_nr_cached_objects, - .free_cached_objects = xfs_fs_free_cached_objects, +static const struct fs_context_operations xfs_context_ops = { + .parse_param = xfs_fc_parse_param, + .get_tree = xfs_fc_get_tree, + .reconfigure = xfs_fc_reconfigure, + .free = xfs_fc_free, }; +static int xfs_init_fs_context( + struct fs_context *fc) +{ + struct xfs_mount *mp; + + mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO); + if (!mp) + return -ENOMEM; + + spin_lock_init(&mp->m_sb_lock); + spin_lock_init(&mp->m_agirotor_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + spin_lock_init(&mp->m_perag_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + mp->m_allocsize_log = 16; /* 64k */ + + /* + * Copy binary VFS mount flags we are interested in. 
+ */ + if (fc->sb_flags & SB_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (fc->sb_flags & SB_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + fc->s_fs_info = mp; + fc->ops = &xfs_context_ops; + + return 0; +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", - .mount = xfs_fs_mount, + .init_fs_context = xfs_init_fs_context, + .parameters = xfs_fs_parameters, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1853,37 +1804,39 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS)) - goto out; - - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), - "xfs_log_ticket"); + xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket", + sizeof(struct xlog_ticket), + 0, 0, NULL); if (!xfs_log_ticket_zone) - goto out_free_ioend_bioset; + goto out; - xfs_bmap_free_item_zone = kmem_zone_init( - sizeof(struct xfs_extent_free_item), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item", + sizeof(struct xfs_extent_free_item), + 0, 0, NULL); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; - xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), - "xfs_btree_cur"); + xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur", + sizeof(struct xfs_btree_cur), + 0, 0, NULL); if (!xfs_btree_cur_zone) goto out_destroy_bmap_free_item_zone; - xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), - "xfs_da_state"); + xfs_da_state_zone = kmem_cache_create("xfs_da_state", + sizeof(struct xfs_da_state), + 0, 0, NULL); if (!xfs_da_state_zone) goto out_destroy_btree_cur_zone; - xfs_ifork_zone = kmem_zone_init(sizeof(struct xfs_ifork), "xfs_ifork"); + xfs_ifork_zone = kmem_cache_create("xfs_ifork", + sizeof(struct xfs_ifork), + 0, 0, NULL); if (!xfs_ifork_zone) 
goto out_destroy_da_state_zone; - xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + xfs_trans_zone = kmem_cache_create("xf_trans", + sizeof(struct xfs_trans), + 0, 0, NULL); if (!xfs_trans_zone) goto out_destroy_ifork_zone; @@ -1893,111 +1846,122 @@ xfs_init_zones(void) * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), - "xfs_buf_item"); + xfs_buf_item_zone = kmem_cache_create("xfs_buf_item", + sizeof(struct xfs_buf_log_item), + 0, 0, NULL); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; - xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efd_item"); + xfs_efd_zone = kmem_cache_create("xfs_efd_item", + (sizeof(struct xfs_efd_log_item) + + (XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_extent)), + 0, 0, NULL); if (!xfs_efd_zone) goto out_destroy_buf_item_zone; - xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), "xfs_efi_item"); + xfs_efi_zone = kmem_cache_create("xfs_efi_item", + (sizeof(struct xfs_efi_log_item) + + (XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_extent)), + 0, 0, NULL); if (!xfs_efi_zone) goto out_destroy_efd_zone; - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | - KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); + xfs_inode_zone = kmem_cache_create("xfs_inode", + sizeof(struct xfs_inode), 0, + (SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | SLAB_ACCOUNT), + xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); + xfs_ili_zone = kmem_cache_create("xfs_ili", + sizeof(struct xfs_inode_log_item), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + 
NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; - xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), - "xfs_icr"); + + xfs_icreate_zone = kmem_cache_create("xfs_icr", + sizeof(struct xfs_icreate_item), + 0, 0, NULL); if (!xfs_icreate_zone) goto out_destroy_ili_zone; - xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item), - "xfs_rud_item"); + xfs_rud_zone = kmem_cache_create("xfs_rud_item", + sizeof(struct xfs_rud_log_item), + 0, 0, NULL); if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init( + xfs_rui_zone = kmem_cache_create("xfs_rui_item", xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), - "xfs_rui_item"); + 0, 0, NULL); if (!xfs_rui_zone) goto out_destroy_rud_zone; - xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), - "xfs_cud_item"); + xfs_cud_zone = kmem_cache_create("xfs_cud_item", + sizeof(struct xfs_cud_log_item), + 0, 0, NULL); if (!xfs_cud_zone) goto out_destroy_rui_zone; - xfs_cui_zone = kmem_zone_init( + xfs_cui_zone = kmem_cache_create("xfs_cui_item", xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), - "xfs_cui_item"); + 0, 0, NULL); if (!xfs_cui_zone) goto out_destroy_cud_zone; - xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), - "xfs_bud_item"); + xfs_bud_zone = kmem_cache_create("xfs_bud_item", + sizeof(struct xfs_bud_log_item), + 0, 0, NULL); if (!xfs_bud_zone) goto out_destroy_cui_zone; - xfs_bui_zone = kmem_zone_init( + xfs_bui_zone = kmem_cache_create("xfs_bui_item", xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), - "xfs_bui_item"); + 0, 0, NULL); if (!xfs_bui_zone) goto out_destroy_bud_zone; return 0; out_destroy_bud_zone: - kmem_zone_destroy(xfs_bud_zone); + kmem_cache_destroy(xfs_bud_zone); out_destroy_cui_zone: - kmem_zone_destroy(xfs_cui_zone); + kmem_cache_destroy(xfs_cui_zone); out_destroy_cud_zone: - kmem_zone_destroy(xfs_cud_zone); + kmem_cache_destroy(xfs_cud_zone); out_destroy_rui_zone: - kmem_zone_destroy(xfs_rui_zone); + 
kmem_cache_destroy(xfs_rui_zone); out_destroy_rud_zone: - kmem_zone_destroy(xfs_rud_zone); + kmem_cache_destroy(xfs_rud_zone); out_destroy_icreate_zone: - kmem_zone_destroy(xfs_icreate_zone); + kmem_cache_destroy(xfs_icreate_zone); out_destroy_ili_zone: - kmem_zone_destroy(xfs_ili_zone); + kmem_cache_destroy(xfs_ili_zone); out_destroy_inode_zone: - kmem_zone_destroy(xfs_inode_zone); + kmem_cache_destroy(xfs_inode_zone); out_destroy_efi_zone: - kmem_zone_destroy(xfs_efi_zone); + kmem_cache_destroy(xfs_efi_zone); out_destroy_efd_zone: - kmem_zone_destroy(xfs_efd_zone); + kmem_cache_destroy(xfs_efd_zone); out_destroy_buf_item_zone: - kmem_zone_destroy(xfs_buf_item_zone); + kmem_cache_destroy(xfs_buf_item_zone); out_destroy_trans_zone: - kmem_zone_destroy(xfs_trans_zone); + kmem_cache_destroy(xfs_trans_zone); out_destroy_ifork_zone: - kmem_zone_destroy(xfs_ifork_zone); + kmem_cache_destroy(xfs_ifork_zone); out_destroy_da_state_zone: - kmem_zone_destroy(xfs_da_state_zone); + kmem_cache_destroy(xfs_da_state_zone); out_destroy_btree_cur_zone: - kmem_zone_destroy(xfs_btree_cur_zone); + kmem_cache_destroy(xfs_btree_cur_zone); out_destroy_bmap_free_item_zone: - kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_cache_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: - kmem_zone_destroy(xfs_log_ticket_zone); - out_free_ioend_bioset: - bioset_exit(&xfs_ioend_bioset); + kmem_cache_destroy(xfs_log_ticket_zone); out: return -ENOMEM; } @@ -2010,25 +1974,24 @@ xfs_destroy_zones(void) * destroy caches. 
*/ rcu_barrier(); - kmem_zone_destroy(xfs_bui_zone); - kmem_zone_destroy(xfs_bud_zone); - kmem_zone_destroy(xfs_cui_zone); - kmem_zone_destroy(xfs_cud_zone); - kmem_zone_destroy(xfs_rui_zone); - kmem_zone_destroy(xfs_rud_zone); - kmem_zone_destroy(xfs_icreate_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_log_ticket_zone); - bioset_exit(&xfs_ioend_bioset); + kmem_cache_destroy(xfs_bui_zone); + kmem_cache_destroy(xfs_bud_zone); + kmem_cache_destroy(xfs_cui_zone); + kmem_cache_destroy(xfs_cud_zone); + kmem_cache_destroy(xfs_rui_zone); + kmem_cache_destroy(xfs_rud_zone); + kmem_cache_destroy(xfs_icreate_zone); + kmem_cache_destroy(xfs_ili_zone); + kmem_cache_destroy(xfs_inode_zone); + kmem_cache_destroy(xfs_efi_zone); + kmem_cache_destroy(xfs_efd_zone); + kmem_cache_destroy(xfs_buf_item_zone); + kmem_cache_destroy(xfs_trans_zone); + kmem_cache_destroy(xfs_ifork_zone); + kmem_cache_destroy(xfs_da_state_zone); + kmem_cache_destroy(xfs_btree_cur_zone); + kmem_cache_destroy(xfs_bmap_free_item_zone); + kmem_cache_destroy(xfs_log_ticket_zone); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 763e43d22dee..b552cf6d3379 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -11,9 +11,11 @@ #ifdef CONFIG_XFS_QUOTA extern int xfs_qm_init(void); extern void xfs_qm_exit(void); +# define XFS_QUOTA_STRING "quota, " #else # define xfs_qm_init() (0) # define xfs_qm_exit() do { } while (0) +# define XFS_QUOTA_STRING #endif #ifdef CONFIG_XFS_POSIX_ACL @@ -50,6 +52,12 @@ extern void xfs_qm_exit(void); # define XFS_WARN_STRING #endif +#ifdef CONFIG_XFS_ASSERT_FATAL +# define 
XFS_ASSERT_FATAL_STRING "fatal assert, " +#else +# define XFS_ASSERT_FATAL_STRING +#endif + #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -63,6 +71,8 @@ extern void xfs_qm_exit(void); XFS_SCRUB_STRING \ XFS_REPAIR_STRING \ XFS_WARN_STRING \ + XFS_QUOTA_STRING \ + XFS_ASSERT_FATAL_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index ed66fd2de327..13fb4b919648 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -17,6 +17,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_quota.h" +#include "xfs_symlink.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" @@ -52,20 +53,10 @@ xfs_readlink_bmap_ilocked( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, - &xfs_symlink_buf_ops); - if (!bp) - return -ENOMEM; - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - goto out; - } + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (error) + return error; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); if (pathlen < byte_cnt) byte_cnt = pathlen; @@ -185,15 +176,12 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, - xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -203,7 +191,7 @@ xfs_symlink( * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. 
*/ - if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + if (pathlen <= XFS_LITINO(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); @@ -289,12 +277,10 @@ xfs_symlink( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) goto out_trans_cancel; - } bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -432,13 +418,12 @@ xfs_inactive_symlink_rmt( * Invalidate the block(s). No validation is done. */ for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) goto error_trans_cancel; - } xfs_trans_binval(tp, bp); } /* diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 9743d8c9394b..b1fa091427e6 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -5,7 +5,7 @@ #ifndef __XFS_SYMLINK_H #define __XFS_SYMLINK_H 1 -/* Kernel only symlink defintions */ +/* Kernel only symlink definitions */ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index bc85b89f88ca..120398a37c2a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -27,6 +28,7 @@ #include "xfs_log_recover.h" #include "xfs_filestream.h" #include "xfs_fsmap.h" +#include 
"xfs_btree_staging.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..a4323a63438d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -35,6 +35,12 @@ struct xfs_icreate_log; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; +union xfs_btree_ptr; + +#define XFS_ATTR_FILTER_FLAGS \ + { XFS_ATTR_ROOT, "ROOT" }, \ + { XFS_ATTR_SECURE, "SECURE" }, \ + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -45,39 +51,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s", + "buffer %p size %u count %u firstu %u filter %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + __print_flags(__entry->attr_filter, "|", + 
XFS_ATTR_FILTER_FLAGS) ) ) @@ -169,31 +175,31 @@ TRACE_EVENT(xfs_attr_list_node_descend, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) __field(u32, bt_hashval) __field(u32, bt_before) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; __entry->bt_hashval = be32_to_cpu(btree->hashval); __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s " + "buffer %p size %u count %u firstu %u filter %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -201,12 +207,12 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), __entry->bt_hashval, __entry->bt_before) ); @@ -218,8 +224,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(void *, leaf); - __field(int, pos); + __field(void *, leaf) + __field(int, pos) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) 
__field(xfs_filblks_t, blockcount) @@ -725,7 +731,7 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->writeio_blocks = writeio_blocks; ), TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " - "m_writeio_blocks %u", + "m_allocsize_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) @@ -995,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) -DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); -DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); @@ -1005,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1158,71 +1163,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), - 
TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(pgoff_t, pgoff) - __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = XFS_I(inode)->i_ino; - __entry->pgoff = page_offset(page); - __entry->size = i_size_read(inode); - __entry->offset = off; - __entry->length = len; - ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "length %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pgoff, - __entry->size, - __entry->offset, - __entry->length) -) - -#define DEFINE_PAGE_EVENT(name) \ -DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ - unsigned int len), \ - TP_ARGS(inode, page, off, len)) -DEFINE_PAGE_EVENT(xfs_writepage); -DEFINE_PAGE_EVENT(xfs_releasepage); -DEFINE_PAGE_EVENT(xfs_invalidatepage); - -DECLARE_EVENT_CLASS(xfs_readpage_class, - TP_PROTO(struct inode *inode, int nr_pages), - TP_ARGS(inode, nr_pages), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(int, nr_pages) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->nr_pages = nr_pages; - ), - TP_printk("dev %d:%d ino 0x%llx nr_pages %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->nr_pages) -) - -#define DEFINE_READPAGE_EVENT(name) \ -DEFINE_EVENT(xfs_readpage_class, name, \ - TP_PROTO(struct inode *inode, int nr_pages), \ - TP_ARGS(inode, nr_pages)) -DEFINE_READPAGE_EVENT(xfs_vm_readpage); -DEFINE_READPAGE_EVENT(xfs_vm_readpages); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1642,8 +1582,11 @@ DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 
DEFINE_ALLOC_EVENT(xfs_alloc_near_first); -DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); -DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_cur); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_right); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_left); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup); +DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done); DEFINE_ALLOC_EVENT(xfs_alloc_near_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); @@ -1663,6 +1606,32 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); +TRACE_EVENT(xfs_alloc_cur_check, + TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno, + xfs_extlen_t len, xfs_extlen_t diff, bool new), + TP_ARGS(mp, btnum, bno, len, diff, new), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agblock_t, bno) + __field(xfs_extlen_t, len) + __field(xfs_extlen_t, diff) + __field(bool, new) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->btnum = btnum; + __entry->bno = bno; + __entry->len = len; + __entry->diff = diff; + __entry->new = new; + ), + TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->bno, __entry->len, __entry->diff, __entry->new) +) + DECLARE_EVENT_CLASS(xfs_da_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), @@ -1737,7 +1706,8 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) - __field(int, flags) + __field(unsigned int, attr_filter) + __field(unsigned int, attr_flags) __field(int, op_flags) ), TP_fast_assign( @@ -1748,11 +1718,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; - __entry->flags = args->flags; + 
__entry->attr_filter = args->attr_filter; + __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x flags %s op_flags %s", + "hashval 0x%x filter %s flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1760,7 +1731,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), + __print_flags(__entry->attr_flags, "|", + { XATTR_CREATE, "CREATE" }, + { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -3609,6 +3584,172 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + +TRACE_EVENT(xfs_btree_commit_afakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, levels) + __field(unsigned int, blocks) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = cur->bc_ag.agno; + __entry->agbno = 
cur->bc_ag.afake->af_root; + __entry->levels = cur->bc_ag.afake->af_levels; + __entry->blocks = cur->bc_ag.afake->af_blocks; + ), + TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->levels, + __entry->blocks, + __entry->agbno) +) + +TRACE_EVENT(xfs_btree_commit_ifakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, levels) + __field(unsigned int, blocks) + __field(int, whichfork) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->levels = cur->bc_ino.ifake->if_levels; + __entry->blocks = cur->bc_ino.ifake->if_blocks; + __entry->whichfork = cur->bc_ino.whichfork; + ), + TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->agino, + __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __entry->levels, + __entry->blocks) +) + +TRACE_EVENT(xfs_btree_bload_level_geometry, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t nr_this_level, unsigned int nr_per_block, + unsigned int desired_npb, uint64_t blocks, + uint64_t blocks_with_extra), + TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks, + blocks_with_extra), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned int, nlevels) + __field(uint64_t, nr_this_level) + __field(unsigned int, nr_per_block) + __field(unsigned int, desired_npb) + __field(unsigned long long, blocks) + __field(unsigned long long, blocks_with_extra) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->nr_this_level = nr_this_level; + __entry->nr_per_block = nr_per_block; + __entry->desired_npb = desired_npb; + __entry->blocks = blocks; + __entry->blocks_with_extra = blocks_with_extra; + ), + TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->nlevels, + __entry->nr_this_level, + __entry->nr_per_block, + __entry->desired_npb, + __entry->blocks, + __entry->blocks_with_extra) +) + +TRACE_EVENT(xfs_btree_bload_block, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t block_idx, uint64_t nr_blocks, + union xfs_btree_ptr *ptr, unsigned int nr_records), + TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned long long, block_idx) + __field(unsigned long long, nr_blocks) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + 
__field(unsigned int, nr_records) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->block_idx = block_idx; + __entry->nr_blocks = nr_blocks; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); + } else { + __entry->agno = cur->bc_ag.agno; + __entry->agbno = be32_to_cpu(ptr->s); + } + __entry->nr_records = nr_records; + ), + TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->block_idx, + __entry->nr_blocks, + __entry->agno, + __entry->agbno, + __entry->nr_records) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f4795fdb7389..28b983ff8b11 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,6 +9,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -71,7 +72,7 @@ xfs_trans_free( if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); + kmem_cache_free(xfs_trans_zone, tp); } /* @@ -150,8 +151,9 @@ xfs_trans_reserve( uint blocks, uint rtextents) { - int error = 0; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -162,7 +164,7 @@ xfs_trans_reserve( * fail if the count would go below zero. 
*/ if (blocks > 0) { - error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; @@ -191,9 +193,9 @@ xfs_trans_reserve( if (tp->t_ticket != NULL) { ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, @@ -213,7 +215,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + error = xfs_mod_frextents(mp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -229,7 +231,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -237,7 +239,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); + xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -306,6 +308,11 @@ xfs_trans_alloc( * * Note the zero-length reservation; this transaction MUST be cancelled * without any dirty data. + * + * Callers should obtain freeze protection to avoid two conflicts with fs + * freezing: (1) having active transactions trip the m_active_trans ASSERTs; + * and (2) grabbing buffers at the same time that freeze is trying to drain + * the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -450,7 +457,7 @@ xfs_trans_apply_sb_deltas( int whole = 0; bp = xfs_trans_getsb(tp, tp->t_mountp); - sbp = XFS_BUF_TO_SBP(bp); + sbp = bp->b_addr; /* * Check that superblock mods match the mods made to AGF counters. 
@@ -999,9 +1006,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); - if (commit_lsn == -1 && !error) - error = -EIO; + if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + else + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1060,7 +1068,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_done(mp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 64d7f171ebd3..752c7fef9de7 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -169,21 +169,21 @@ int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); -struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, - struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - uint flags); +int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, + struct xfs_buf **bpp); -static inline struct xfs_buf * +static inline int xfs_trans_get_buf( struct xfs_trans *tp, struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags) + uint flags, + struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_trans_get_buf_map(tp, target, &map, 1, flags); + return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp); } int xfs_trans_read_buf_map(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6ccfd75d3c24..564253550b75 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,6 +32,7 @@ STATIC void xfs_ail_check( struct xfs_ail *ailp, struct xfs_log_item *lip) + __must_hold(&ailp->ail_lock) { struct 
xfs_log_item *prev_lip; struct xfs_log_item *next_lip; @@ -108,17 +109,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. */ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -427,15 +436,15 @@ xfsaild_push( case XFS_ITEM_FLUSHING: /* - * The item or its backing buffer is already beeing + * The item or its backing buffer is already being * flushed. The typical reason for that is that an * inode buffer is locked because we already pushed the * updates to it as part of inode clustering. * * We do not want to to stop flushing just because lots - * of items are already beeing flushed, but we need to + * of items are already being flushed, but we need to * re-try the flushing relatively soon if most of the - * AIL is beeing flushed. + * AIL is being flushed. */ XFS_STATS_INC(mp, xs_push_ail_flushing); trace_xfs_ail_flushing(lip); @@ -529,8 +538,9 @@ xfsaild( { struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + unsigned int noreclaim_flag; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); set_freezable(); while (1) { @@ -601,6 +611,7 @@ xfsaild( tout = xfsaild_push(ailp); } + memalloc_noreclaim_restore(noreclaim_flag); return 0; } @@ -612,7 +623,7 @@ xfsaild( * The push is run asynchronously in a workqueue, which means the caller needs * to handle waiting on the async flush for space to become available. * We don't want to interrupt any push that is in progress, hence we only queue - * work if we set the pushing bit approriately. 
+ * work if we set the pushing bit appropriately. * * We do this unlocked - we only need to know whether there is anything in the * AIL at the time we are called. We don't need to access the contents of @@ -678,6 +689,28 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. * @@ -709,7 +742,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -724,9 +757,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -737,23 +771,23 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); - - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } + xfs_ail_update_finish(ailp, tail_lsn); } -bool +/* + * Delete one log item from the AIL. 
+ * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). + */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); @@ -761,7 +795,9 @@ xfs_ail_delete_one( clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } /** @@ -789,10 +825,10 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + xfs_lsn_t tail_lsn; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -805,17 +841,8 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int @@ -836,7 +863,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_fsname); + ailp->ail_mount->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index b5b3a78ef31c..08174ffa2118 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -112,19 +112,22 @@ xfs_trans_bjoin( * If the transaction pointer is NULL, make this just a normal * get_buf() call. 
*/ -struct xfs_buf * +int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { xfs_buf_t *bp; struct xfs_buf_log_item *bip; + int error; + *bpp = NULL; if (!tp) - return xfs_buf_get_map(target, map, nmaps, flags); + return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction @@ -146,19 +149,20 @@ xfs_trans_get_buf_map( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); - return bp; + *bpp = bp; + return 0; } - bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp == NULL) { - return NULL; - } + error = xfs_buf_get_map(target, map, nmaps, flags, &bp); + if (error) + return error; ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_log_item); - return bp; + *bpp = bp; + return 0; } /* @@ -276,7 +280,7 @@ xfs_trans_read_buf_map( ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, @@ -298,36 +302,17 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (!bp) { - if (!(flags & XBF_TRYLOCK)) - return -ENOMEM; - return tp ? 0 : -EAGAIN; - } - - /* - * If we've had a read error, then the contents of the buffer are - * invalid and should not be used. To ensure that a followup read tries - * to pull the buffer from disk again, we clear the XBF_DONE flag and - * mark the buffer stale. This ensures that anyone who has a current - * reference to the buffer will interpret it's contents correctly and - * future cache lookups will also treat it as an empty, uninitialised - * buffer. 
- */ - if (bp->b_error) { - error = bp->b_error; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_buf_ioerror_alert(bp, __func__); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - + error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, + __return_address); + switch (error) { + case 0: + break; + default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; + /* fall through */ + case -ENOMEM: + case -EAGAIN: return error; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 16457465833b..d1b9869bc5fa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -25,8 +25,8 @@ STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); */ void xfs_trans_dqjoin( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(dqp->q_logitem.qli_dquot == dqp); @@ -49,8 +49,8 @@ xfs_trans_dqjoin( */ void xfs_trans_log_dquot( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -486,12 +486,12 @@ xfs_trans_apply_dquot_deltas( */ void xfs_trans_unreserve_and_mod_dquots( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; + struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; - bool locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -571,21 +571,21 @@ xfs_quota_warn( */ STATIC int xfs_trans_dqresv( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - int64_t nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int64_t nblks, + long ninos, + uint flags) { - xfs_qcnt_t hardlimit; - xfs_qcnt_t softlimit; - time_t timer; - xfs_qwarncnt_t warns; - xfs_qwarncnt_t warnlimit; - xfs_qcnt_t total_count; - xfs_qcnt_t *resbcountp; - 
xfs_quotainfo_t *q = mp->m_quotainfo; + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time64_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t total_count; + xfs_qcnt_t *resbcountp; + struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; @@ -635,7 +635,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTLONGWARN); @@ -662,7 +663,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTLONGWARN); @@ -824,13 +826,13 @@ xfs_trans_reserve_quota_nblks( /* * This routine is called to allocate a quotaoff log item. */ -xfs_qoff_logitem_t * +struct xfs_qoff_logitem * xfs_trans_get_qoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *startqoff, + struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, uint flags) { - xfs_qoff_logitem_t *q; + struct xfs_qoff_logitem *q; ASSERT(tp != NULL); @@ -852,8 +854,8 @@ xfs_trans_get_qoff_item( */ void xfs_trans_log_quotaoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *qlp) + struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); @@ -872,6 +874,6 @@ xfs_trans_free_dqinfo( { if (!tp->t_dqinfo) return; - kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo); + kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo); tp->t_dqinfo = NULL; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..35655eac01a6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,9 +91,11 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool 
xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) + __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); + int shutdown_type); static inline void xfs_trans_ail_remove( diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index cb895b1df5e4..fc5d7276026e 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -11,51 +11,31 @@ #include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_attr.h" +#include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); - int error, asize = size; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - value = NULL; - } + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .name = name, + .namelen = strlen(name), + .value = value, + .valuelen = size, + }; + int error; - error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); + error = xfs_attr_get(&args); if (error) return error; - return asize; -} - -void -xfs_forget_acl( - struct inode *inode, - const char *name, - int xflags) -{ - /* - * Invalidate any cached ACLs if the user has bypassed the ACL - * interface. We don't validate the content whatsoever so it is caller - * responsibility to provide data in valid format and ensure i_mode is - * consistent. 
- */ - if (xflags & ATTR_ROOT) { -#ifdef CONFIG_XFS_POSIX_ACL - if (!strcmp(name, SGI_ACL_FILE)) - forget_cached_acl(inode, ACL_TYPE_ACCESS); - else if (!strcmp(name, SGI_ACL_DEFAULT)) - forget_cached_acl(inode, ACL_TYPE_DEFAULT); -#endif - } + return args.valuelen; } static int @@ -63,23 +43,20 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .attr_flags = flags, + .name = name, + .namelen = strlen(name), + .value = (void *)value, + .valuelen = size, + }; int error; - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - - if (!value) - return xfs_attr_remove(ip, (unsigned char *)name, xflags); - error = xfs_attr_set(ip, (unsigned char *)name, - (void *)value, size, xflags); - if (!error) - xfs_forget_acl(inode, name, xflags); - + error = xfs_attr_set(&args); + if (!error && (handler->flags & XFS_ATTR_ROOT)) + xfs_forget_acl(inode, name); return error; } @@ -92,14 +69,14 @@ static const struct xattr_handler xfs_xattr_user_handler = { static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .flags = ATTR_ROOT, + .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .flags = ATTR_SECURE, + .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; @@ -129,7 +106,7 @@ __xfs_xattr_put_listent( if (context->count < 0 || context->seen_enough) return; - if (!context->alist) + if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; @@ -138,7 +115,7 @@ __xfs_xattr_put_listent( 
context->seen_enough = 1; return; } - offset = (char *)context->alist + context->count; + offset = context->buffer + context->count; strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ @@ -213,7 +190,6 @@ xfs_vn_listxattr( size_t size) { struct xfs_attr_list_context context; - struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); int error; @@ -222,14 +198,13 @@ xfs_vn_listxattr( */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); - context.cursor = &cursor; context.resynch = 1; - context.alist = size ? data : NULL; + context.buffer = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - error = xfs_attr_list_int(&context); + error = xfs_attr_list(&context); if (error) return error; if (context.count < 0) |