Diffstat (limited to 'fs/xfs')
42 files changed, 517 insertions, 605 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 6f49bf39183c..c557a030acfe 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -4,7 +4,6 @@
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include <linux/backing-dev.h>
 #include "xfs_message.h"
 #include "xfs_trace.h"
@@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
 				current->comm, current->pid,
 				(unsigned int)size, __func__, lflags);
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		memalloc_retry_wait(lflags);
 	} while (1);
 }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4dccd4d90622..74198dd82b03 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc(
 	 * the extent.  Just return the real extent at this offset.
 	 */
 	if (!isnullstartblock(bma.got.br_startblock)) {
-		xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 		*seq = READ_ONCE(ifp->if_seq);
 		goto out_trans_cancel;
 	}
@@ -4598,7 +4598,7 @@ xfs_bmapi_convert_delalloc(
 	XFS_STATS_INC(mp, xs_xstrat_quick);
 
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
-	xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
 	*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index c43877c8a279..505533c43a92 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -93,21 +93,6 @@ struct getbmapx {
 #define XFS_FMR_OWN_DEFECTIVE	FMR_OWNER('X', 8) /* bad blocks */
 
 /*
- * Structure for XFS_IOC_FSSETDM.
- * For use by backup and restore programs to set the XFS on-disk inode
- * fields di_dmevmask and di_dmstate.  These must be set to exactly and
- * only values previously obtained via xfs_bulkstat!  (Specifically the
- * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
- */
-#ifndef HAVE_FSDMIDATA
-struct fsdmidata {
-	__u32		fsd_dmevmask;	/* corresponds to di_dmevmask */
-	__u16		fsd_padding;
-	__u16		fsd_dmstate;	/* corresponds to di_dmstate */
-};
-#endif
-
-/*
  * File segment locking set data type for 64 bit access.
  * Also used for all the RESV/FREE interfaces.
  */
@@ -562,16 +547,10 @@ typedef struct xfs_fsop_handlereq {
 
 /*
  * Compound structures for passing args through Handle Request interfaces
- * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
- * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
- *	     XFS_IOC_ATTRMULTI_BY_HANDLE
+ * xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE
  */
 
-typedef struct xfs_fsop_setdm_handlereq {
-	struct xfs_fsop_handlereq	hreq;	/* handle information	*/
-	struct fsdmidata		__user *data;	/* DMAPI data	*/
-} xfs_fsop_setdm_handlereq_t;
-
 /*
  * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
  *
@@ -781,15 +760,15 @@ struct xfs_scrub_metadata {
  * For 'documentation' purposes more than anything else,
  * the "cmd #" field reflects the IRIX fcntl number.
 */
-#define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
-#define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
+/*	XFS_IOC_ALLOCSP ------- deprecated 10	 */
+/*	XFS_IOC_FREESP -------- deprecated 11	 */
 #define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr)
 #define XFS_IOC_FSGETXATTR	FS_IOC_FSGETXATTR
 #define XFS_IOC_FSSETXATTR	FS_IOC_FSSETXATTR
-#define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
-#define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
+/*	XFS_IOC_ALLOCSP64 ----- deprecated 36	 */
+/*	XFS_IOC_FREESP64 ------ deprecated 37	 */
 #define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap)
-#define XFS_IOC_FSSETDM		_IOW ('X', 39, struct fsdmidata)
+/*	XFS_IOC_FSSETDM ------- deprecated 39	 */
 #define XFS_IOC_RESVSP		_IOW ('X', 40, struct xfs_flock64)
 #define XFS_IOC_UNRESVSP	_IOW ('X', 41, struct xfs_flock64)
 #define XFS_IOC_RESVSP64	_IOW ('X', 42, struct xfs_flock64)
@@ -831,7 +810,7 @@ struct xfs_scrub_metadata {
 #define XFS_IOC_FREEZE		     _IOWR('X', 119, int)	/* aka FIFREEZE */
 #define XFS_IOC_THAW		     _IOWR('X', 120, int)	/* aka FITHAW */
-#define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+/*	XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121	 */
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
 #define XFS_IOC_FSGEOMETRY_V4	     _IOR ('X', 124, struct xfs_fsop_geom_v4)
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index bed798792226..90aebfe9dc5f 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -281,7 +281,7 @@ xchk_superblock(
 	features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
 	if ((sb->sb_features2 & features_mask) !=
 	    (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
-		xchk_block_set_corrupt(sc, bp);
+		xchk_block_set_preen(sc, bp);
 
 	if (!xfs_has_crc(mp)) {
 		/* all v5 fields must be zero */
@@ -290,39 +290,38 @@ xchk_superblock(
 				offsetof(struct xfs_dsb, sb_features_compat)))
 			xchk_block_set_corrupt(sc, bp);
 	} else {
-		/* Check compat flags; all are set at mkfs time. */
-		features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
-		if ((sb->sb_features_compat & features_mask) !=
-		    (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+		/* compat features must match */
+		if (sb->sb_features_compat !=
+		    cpu_to_be32(mp->m_sb.sb_features_compat))
 			xchk_block_set_corrupt(sc, bp);
 
-		/* Check ro compat flags; all are set at mkfs time. */
-		features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
-					    XFS_SB_FEAT_RO_COMPAT_FINOBT |
-					    XFS_SB_FEAT_RO_COMPAT_RMAPBT |
-					    XFS_SB_FEAT_RO_COMPAT_REFLINK);
-		if ((sb->sb_features_ro_compat & features_mask) !=
-		    (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
-		     features_mask))
+		/* ro compat features must match */
+		if (sb->sb_features_ro_compat !=
+		    cpu_to_be32(mp->m_sb.sb_features_ro_compat))
 			xchk_block_set_corrupt(sc, bp);
 
-		/* Check incompat flags; all are set at mkfs time. */
-		features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
-					    XFS_SB_FEAT_INCOMPAT_FTYPE |
-					    XFS_SB_FEAT_INCOMPAT_SPINODES |
-					    XFS_SB_FEAT_INCOMPAT_META_UUID);
-		if ((sb->sb_features_incompat & features_mask) !=
-		    (cpu_to_be32(mp->m_sb.sb_features_incompat) &
-		     features_mask))
-			xchk_block_set_corrupt(sc, bp);
+		/*
+		 * NEEDSREPAIR is ignored on a secondary super, so we should
+		 * clear it when we find it, though it's not a corruption.
+		 */
+		features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+		if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+				sb->sb_features_incompat) & features_mask)
+			xchk_block_set_preen(sc, bp);
 
-		/* Check log incompat flags; all are set at mkfs time. */
-		features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
-		if ((sb->sb_features_log_incompat & features_mask) !=
-		    (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
-		     features_mask))
+		/* all other incompat features must match */
+		if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+				sb->sb_features_incompat) & ~features_mask)
 			xchk_block_set_corrupt(sc, bp);
 
+		/*
+		 * log incompat features protect newer log record types from
+		 * older log recovery code.  Log recovery doesn't check the
+		 * secondary supers, so we can clear these if needed.
+		 */
+		if (sb->sb_features_log_incompat)
+			xchk_block_set_preen(sc, bp);
+
 		/* Don't care about sb_crc */
 
 		if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index d7bfed52f4cd..6da7f2ca77de 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -52,6 +52,18 @@ xrep_superblock(
 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 
+	/*
+	 * Don't write out a secondary super with NEEDSREPAIR or log incompat
+	 * features set, since both are ignored when set on a secondary.
+	 */
+	if (xfs_has_crc(mp)) {
+		struct xfs_dsb		*sb = bp->b_addr;
+
+		sb->sb_features_incompat &=
+				~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+		sb->sb_features_log_incompat = 0;
+	}
+
 	/* Write this to disk. */
 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
 	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 200a63f58fe7..38897adde7b5 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -497,6 +497,7 @@ STATIC int
 xchk_directory_leaf1_bestfree(
 	struct xfs_scrub		*sc,
 	struct xfs_da_args		*args,
+	xfs_dir2_db_t			last_data_db,
 	xfs_dablk_t			lblk)
 {
 	struct xfs_dir3_icleaf_hdr	leafhdr;
@@ -534,10 +535,14 @@ xchk_directory_leaf1_bestfree(
 	}
 
 	/*
-	 * There should be as many bestfree slots as there are dir data
-	 * blocks that can fit under i_size.
+	 * There must be enough bestfree slots to cover all the directory data
+	 * blocks that we scanned.  It is possible for there to be a hole
+	 * between the last data block and i_disk_size.  This seems like an
+	 * oversight to the scrub author, but as we have been writing out
+	 * directories like this (and xfs_repair doesn't mind them) for years,
+	 * that's what we have to check.
	 */
-	if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_disk_size)) {
+	if (bestcount != last_data_db + 1) {
 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
 		goto out;
 	}
@@ -669,6 +674,7 @@ xchk_directory_blocks(
 	xfs_fileoff_t		lblk;
 	struct xfs_iext_cursor	icur;
 	xfs_dablk_t		dabno;
+	xfs_dir2_db_t		last_data_db = 0;
 	bool			found;
 	int			is_block = 0;
 	int			error;
@@ -712,6 +718,7 @@ xchk_directory_blocks(
 				args.geo->fsbcount);
 		     lblk < got.br_startoff + got.br_blockcount;
 		     lblk += args.geo->fsbcount) {
+			last_data_db = xfs_dir2_da_to_db(args.geo, lblk);
 			error = xchk_directory_data_bestfree(sc, lblk,
 					is_block);
 			if (error)
@@ -734,7 +741,7 @@ xchk_directory_blocks(
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
 			goto out;
 		}
-		error = xchk_directory_leaf1_bestfree(sc, &args,
+		error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db,
 				leaf_lblk);
 		if (error)
 			goto out;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 2405b09d03d0..eac15af7b08c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -233,6 +233,7 @@ xchk_dinode(
 	unsigned long long	isize;
 	uint64_t		flags2;
 	uint32_t		nextents;
+	prid_t			prid;
 	uint16_t		flags;
 	uint16_t		mode;
 
@@ -267,6 +268,7 @@ xchk_dinode(
 		 * so just mark this inode for preening.
 		 */
 		xchk_ino_set_preen(sc, ino);
+		prid = 0;
 		break;
 	case 2:
 	case 3:
@@ -279,12 +281,17 @@ xchk_dinode(
 		if (dip->di_projid_hi != 0 &&
 		    !xfs_has_projid32(mp))
 			xchk_ino_set_corrupt(sc, ino);
+
+		prid = be16_to_cpu(dip->di_projid_lo);
 		break;
 	default:
 		xchk_ino_set_corrupt(sc, ino);
 		return;
 	}
 
+	if (xfs_has_projid32(mp))
+		prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16;
+
 	/*
 	 * di_uid/di_gid -- -1 isn't invalid, but there's no way that
 	 * userspace could have created that.
@@ -293,6 +300,13 @@ xchk_dinode(
 	    dip->di_gid == cpu_to_be32(-1U))
 		xchk_ino_set_warning(sc, ino);
 
+	/*
+	 * project id of -1 isn't supposed to be valid, but the kernel didn't
+	 * always validate that.
+	 */
+	if (prid == -1U)
+		xchk_ino_set_warning(sc, ino);
+
 	/* di_format */
 	switch (dip->di_format) {
 	case XFS_DINODE_FMT_DEV:
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index d6c1b00a4fc8..3c7506c7553c 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -48,10 +48,10 @@ xchk_setup_quota(
 	dqtype = xchk_quota_to_dqtype(sc);
 	if (dqtype == 0)
 		return -EINVAL;
-	sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
-	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
+
 	if (!xfs_this_quota_on(sc->mp, dqtype))
 		return -ENOENT;
+
 	error = xchk_setup_fs(sc);
 	if (error)
 		return error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 8f3cba14ada3..1e7b6b209ee8 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,7 @@
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
+#include "xfs_qm.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -912,11 +913,13 @@ xrep_force_quotacheck(
 	if (!(flag & sc->mp->m_qflags))
 		return;
 
+	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
 	sc->mp->m_qflags &= ~flag;
 	spin_lock(&sc->mp->m_sb_lock);
 	sc->mp->m_sb.sb_qflags &= ~flag;
 	spin_unlock(&sc->mp->m_sb_lock);
 	xfs_log_sb(sc->tp);
+	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
 }
 
 /*
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 8d528d35b725..b11870d07c56 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -173,10 +173,6 @@ xchk_teardown(
 			mnt_drop_write_file(sc->file);
 	if (sc->flags & XCHK_REAPING_DISABLED)
 		xchk_start_reaping(sc);
-	if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
-		mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
-		sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
-	}
 	if (sc->buf) {
 		kmem_free(sc->buf);
 		sc->buf = NULL;
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 80e5026bba44..3de5287e98d8 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -88,7 +88,6 @@ struct xfs_scrub {
 
 /* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
 #define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
-#define XCHK_HAS_QUOTAOFFLOCK	(1 << 1)  /* we hold the quotaoff lock */
 #define XCHK_REAPING_DISABLED	(1 << 2)  /* background block reaping paused */
 #define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c8c15c3c3147..9d6a67c7d227 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -136,7 +136,20 @@ done:
 	memalloc_nofs_restore(nofs_flag);
 }
 
-/* Finish all pending io completions. */
+/*
+ * Finish all pending IO completions that require transactional modifications.
+ *
+ * We try to merge physical and logically contiguous ioends before completion
+ * to minimise the number of transactions we need to perform during IO
+ * completion.  Both unwritten extent conversion and COW remapping need to
+ * iterate and modify one physical extent at a time, so we gain nothing by
+ * merging physically discontiguous extents here.
+ *
+ * The ioend chain length that we can be processing here is largely unbound in
+ * length and we may have to perform significant amounts of work on each ioend
+ * to complete it. Hence we have to be careful about holding the CPU for too
+ * long in this loop.
+ */
 void
 xfs_end_io(
 	struct work_struct *work)
@@ -157,6 +170,7 @@ xfs_end_io(
 		list_del_init(&ioend->io_list);
 		iomap_ioend_try_merge(ioend, &tmp);
 		xfs_end_ioend(ioend);
+		cond_resched();
 	}
 }
 
@@ -359,7 +373,7 @@ retry:
 	    isnullstartblock(imap.br_startblock))
 		goto allocate_blocks;
 
-	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 	return 0;
 allocate_blocks:
@@ -437,37 +451,37 @@ xfs_prepare_ioend(
 * see a ENOSPC in writeback).
 */
 static void
-xfs_discard_page(
-	struct page		*page,
-	loff_t			fileoff)
+xfs_discard_folio(
+	struct folio		*folio,
+	loff_t			pos)
 {
-	struct inode		*inode = page->mapping->host;
+	struct inode		*inode = folio->mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	unsigned int		pageoff = offset_in_page(fileoff);
-	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, fileoff);
-	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
+	size_t			offset = offset_in_folio(folio, pos);
+	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, pos);
+	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
 	int			error;
 
 	if (xfs_is_shutdown(mp))
 		goto out_invalidate;
 
 	xfs_alert_ratelimited(mp,
-		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
-			page, ip->i_ino, fileoff);
+		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+			folio, ip->i_ino, pos);
 
 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-			i_blocks_per_page(inode, page) - pageoff_fsb);
+			i_blocks_per_folio(inode, folio) - pageoff_fsb);
 	if (error && !xfs_is_shutdown(mp))
 		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 out_invalidate:
-	iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
+	iomap_invalidate_folio(folio, offset, folio_size(folio) - offset);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
 	.map_blocks		= xfs_map_blocks,
 	.prepare_ioend		= xfs_prepare_ioend,
-	.discard_page		= xfs_discard_page,
+	.discard_folio		= xfs_discard_folio,
 };
 
 STATIC int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73a36b7be3bd..eb2e387ba528 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -771,8 +771,7 @@ int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			alloc_type)
+	xfs_off_t		len)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -851,9 +850,6 @@ xfs_alloc_file_space(
 			rblocks = 0;
 		}
 
-		/*
-		 * Allocate and setup the transaction.
-		 */
 		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
 				dblocks, rblocks, false, &tp);
 		if (error)
@@ -865,14 +861,14 @@ xfs_alloc_file_space(
 			goto error;
 
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, alloc_type, 0, imapp,
-				&nimaps);
+				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+				&nimaps);
 		if (error)
 			goto error;
 
-		/*
-		 * Complete the transaction
-		 */
+		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
 		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
@@ -1001,7 +997,7 @@ xfs_free_file_space(
 
 	/*
	 * Now that we've unmapped all full blocks we'll have to zero out any
-	 * partial block at the beginning and/or end.  iomap_zero_range is smart
+	 * partial block at the beginning and/or end.  xfs_zero_range is smart
 	 * enough to skip any holes, including those we just created, but we
 	 * must take care not to zero beyond EOF and enlarge i_size.
 	 */
@@ -1009,15 +1005,14 @@
 		return 0;
 	if (offset + len > XFS_ISIZE(ip))
 		len = XFS_ISIZE(ip) - offset;
-	error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
-			&xfs_buffered_write_iomap_ops);
+	error = xfs_zero_range(ip, offset, len, NULL);
 	if (error)
 		return error;
 
 	/*
 	 * If we zeroed right up to EOF and EOF straddles a page boundary we
 	 * must make sure that the post-EOF area is also zeroed because the
-	 * page could be mmap'd and iomap_zero_range doesn't do that for us.
+	 * page could be mmap'd and xfs_zero_range doesn't do that for us.
 	 * Writeback of the eof page will do this, albeit clumsily.
 	 */
 	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 9f993168b55b..24b37d211f1d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-			     xfs_off_t len, int alloc_type);
+			     xfs_off_t len);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 631c5a61d89b..b45e0d50a405 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -394,7 +394,7 @@ xfs_buf_alloc_pages(
 		}
 
 		XFS_STATS_INC(bp->b_mount, xb_page_retries);
-		congestion_wait(BLK_RW_ASYNC, HZ / 50);
+		memalloc_retry_wait(gfp_mask);
 	}
 	return 0;
 }
@@ -1892,6 +1892,7 @@ xfs_free_buftarg(
 	list_lru_destroy(&btp->bt_lru);
 
 	blkdev_issue_flush(btp->bt_bdev);
+	fs_put_dax(btp->bt_daxdev);
 	kmem_free(btp);
 }
 
@@ -1932,11 +1933,10 @@ xfs_setsize_buftarg_early(
 	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
 }
 
-xfs_buftarg_t *
+struct xfs_buftarg *
 xfs_alloc_buftarg(
 	struct xfs_mount	*mp,
-	struct block_device	*bdev,
-	struct dax_device	*dax_dev)
+	struct block_device	*bdev)
 {
 	xfs_buftarg_t		*btp;
 
@@ -1945,7 +1945,7 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev = bdev->bd_dev;
 	btp->bt_bdev = bdev;
-	btp->bt_daxdev = dax_dev;
+	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
 
 	/*
 	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6b0200b8007d..edcb6254fa6a 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -89,6 +89,7 @@ typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
 	struct dax_device	*bt_daxdev;
+	u64			bt_dax_part_off;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_meta_sectorsize;
 	size_t			bt_meta_sectormask;
@@ -338,8 +339,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
 /*
 * Handling of buftargs.
 */
-extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
-			struct block_device *, struct dax_device *);
+struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
+			struct block_device *bdev);
 extern void xfs_free_buftarg(struct xfs_buftarg *);
 extern void xfs_buftarg_wait(struct xfs_buftarg *);
 extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 70ca5751b13e..e484251dc9c8 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -816,7 +816,7 @@ xlog_recover_get_buf_lsn(
 	}
 
 	if (lsn != (xfs_lsn_t)-1) {
-		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
 			goto recover_immediately;
 		return lsn;
 	}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8310005af00f..a7174a5b3203 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -138,7 +138,8 @@ xfs_dir2_sf_getdents(
 STATIC int
 xfs_dir2_block_getdents(
 	struct xfs_da_args	*args,
-	struct dir_context	*ctx)
+	struct dir_context	*ctx,
+	unsigned int		*lock_mode)
 {
 	struct xfs_inode	*dp = args->dp;	/* incore directory inode */
 	struct xfs_buf		*bp;		/* buffer for block */
@@ -146,7 +147,6 @@ xfs_dir2_block_getdents(
 	int			wantoff;	/* starting block offset */
 	xfs_off_t		cook;
 	struct xfs_da_geometry	*geo = args->geo;
-	int			lock_mode;
 	unsigned int		offset, next_offset;
 	unsigned int		end;
 
@@ -156,12 +156,13 @@ xfs_dir2_block_getdents(
 	if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
 		return 0;
 
-	lock_mode = xfs_ilock_data_map_shared(dp);
 	error = xfs_dir3_block_read(args->trans, dp, &bp);
-	xfs_iunlock(dp, lock_mode);
 	if (error)
 		return error;
 
+	xfs_iunlock(dp, *lock_mode);
+	*lock_mode = 0;
+
 	/*
 	 * Extract the byte offset we start at from the seek pointer.
 	 * We'll skip entries before this.
@@ -344,7 +345,8 @@ STATIC int
 xfs_dir2_leaf_getdents(
 	struct xfs_da_args	*args,
 	struct dir_context	*ctx,
-	size_t			bufsize)
+	size_t			bufsize,
+	unsigned int		*lock_mode)
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
@@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents(
 	xfs_dir2_off_t		curoff;		/* current overall offset */
 	int			length;		/* temporary length value */
 	int			byteoff;	/* offset in current block */
-	int			lock_mode;
 	unsigned int		offset = 0;
 	int			error = 0;	/* error return value */
 
@@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents(
 				bp = NULL;
 			}
 
-			lock_mode = xfs_ilock_data_map_shared(dp);
+			if (*lock_mode == 0)
+				*lock_mode = xfs_ilock_data_map_shared(dp);
 			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
 					&rablk, &bp);
-			xfs_iunlock(dp, lock_mode);
 			if (error || !bp)
 				break;
 
+			xfs_iunlock(dp, *lock_mode);
+			*lock_mode = 0;
+
 			xfs_dir3_data_check(dp, bp);
 			/*
 			 * Find our position in the block.
@@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents(
 *
 * If supplied, the transaction collects locked dir buffers to avoid
 * nested buffer deadlocks.  This function does not dirty the
- * transaction.  The caller should ensure that the inode is locked
+ * transaction.  The caller must hold the IOLOCK (shared or exclusive)
 * before calling this function.
 */
 int
@@ -507,8 +511,9 @@ xfs_readdir(
 	size_t			bufsize)
 {
 	struct xfs_da_args	args = { NULL };
-	int			rval;
-	int			v;
+	unsigned int		lock_mode;
+	int			isblock;
+	int			error;
 
 	trace_xfs_readdir(dp);
 
@@ -516,6 +521,7 @@
 		return -EIO;
 
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+	ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
 
 	args.dp = dp;
@@ -523,13 +529,22 @@
 	args.trans = tp;
 
 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
-		rval = xfs_dir2_sf_getdents(&args, ctx);
-	else if ((rval = xfs_dir2_isblock(&args, &v)))
-		;
-	else if (v)
-		rval = xfs_dir2_block_getdents(&args, ctx);
-	else
-		rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+		return xfs_dir2_sf_getdents(&args, ctx);
 
-	return rval;
+	lock_mode = xfs_ilock_data_map_shared(dp);
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		goto out_unlock;
+
+	if (isblock) {
+		error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
+		goto out_unlock;
+	}
+
+	error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+
+out_unlock:
+	if (lock_mode)
+		xfs_iunlock(dp, lock_mode);
+	return error;
 }
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e48ae227bb11..5afedcbc78c7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -289,13 +289,12 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
 */
 STATIC int
 xfs_dquot_disk_alloc(
-	struct xfs_trans	**tpp,
 	struct xfs_dquot	*dqp,
 	struct xfs_buf		**bpp)
 {
 	struct xfs_bmbt_irec	map;
-	struct xfs_trans	*tp = *tpp;
-	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp = dqp->q_mount;
 	struct xfs_buf		*bp;
 	xfs_dqtype_t		qtype = xfs_dquot_type(dqp);
 	struct xfs_inode	*quotip = xfs_quota_inode(mp, qtype);
@@ -304,29 +303,35 @@ xfs_dquot_disk_alloc(
 
 	trace_xfs_dqalloc(dqp);
 
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+			XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+	if (error)
+		return error;
+
 	xfs_ilock(quotip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, quotip, 0);
+
 	if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
 		/*
 		 * Return if this type of quotas is turned off while we didn't
 		 * have an inode lock
 		 */
-		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-		return -ESRCH;
+		error = -ESRCH;
+		goto err_cancel;
 	}
 
-	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
-
 	error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
 			XFS_IEXT_ADD_NOSPLIT_CNT);
 	if (error)
-		return error;
+		goto err_cancel;
 
 	/* Create the block mapping. */
 	error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
 			XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
 			&nmaps);
 	if (error)
-		return error;
+		goto err_cancel;
+
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
 	ASSERT(nmaps == 1);
 	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -341,7 +346,7 @@ xfs_dquot_disk_alloc(
 	error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
 			mp->m_quotainfo->qi_dqchunklen, 0, &bp);
 	if (error)
-		return error;
+		goto err_cancel;
 	bp->b_ops = &xfs_dquot_buf_ops;
 
 	/*
@@ -371,16 +376,25 @@ xfs_dquot_disk_alloc(
 	 * is responsible for unlocking any buffer passed back, either
 	 * manually or by committing the transaction.  On error, the buffer is
 	 * released and not passed back.
+	 *
+	 * Keep the quota inode ILOCKed until after the transaction commit to
+	 * maintain the atomicity of bmap/rmap updates.
 	 */
 	xfs_trans_bhold(tp, bp);
-	error = xfs_defer_finish(tpp);
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 	if (error) {
-		xfs_trans_bhold_release(*tpp, bp);
-		xfs_trans_brelse(*tpp, bp);
+		xfs_buf_relse(bp);
 		return error;
 	}
+
 	*bpp = bp;
 	return 0;
+
+err_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+	return error;
 }
 
@@ -629,43 +643,6 @@ xfs_dquot_to_disk(
 	ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
 }
 
-/* Allocate and initialize the dquot buffer for this in-core dquot. */
-static int
-xfs_qm_dqread_alloc(
-	struct xfs_mount	*mp,
-	struct xfs_dquot	*dqp,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_trans	*tp;
-	int			error;
-
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
-			XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
-	if (error)
-		goto err;
-
-	error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
-	if (error)
-		goto err_cancel;
-
-	error = xfs_trans_commit(tp);
-	if (error) {
-		/*
-		 * Buffer was held to the transaction, so we have to unlock it
-		 * manually here because we're not passing it back.
-		 */
-		xfs_buf_relse(*bpp);
-		*bpp = NULL;
-		goto err;
-	}
-	return 0;
-
-err_cancel:
-	xfs_trans_cancel(tp);
-err:
-	return error;
-}
-
 /*
 * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
 * and release the buffer immediately.  If @can_alloc is true, fill any
@@ -689,7 +666,7 @@ xfs_qm_dqread(
 	/* Try to read the buffer, allocating if necessary. */
 	error = xfs_dquot_disk_read(mp, dqp, &bp);
 	if (error == -ENOENT && can_alloc)
-		error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+		error = xfs_dquot_disk_alloc(dqp, &bp);
 	if (error)
 		goto err;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 81c445e9489b..749fd18c4f32 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,11 +213,12 @@ static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_errortag);
 
 static struct kobj_type xfs_errortag_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_errortag_sysfs_ops,
-	.default_attrs = xfs_errortag_attrs,
+	.default_groups = xfs_errortag_groups,
 };
 
 int
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27594738b0d1..5bddb1e9e0b3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -66,40 +66,6 @@ xfs_is_falloc_aligned(
 	return !((pos | len) & mask);
 }
 
-int
-xfs_update_prealloc_flags(
-	struct xfs_inode	*ip,
-	enum xfs_prealloc_flags	flags)
-{
-	struct xfs_trans	*tp;
-	int			error;
-
-	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
-			0, 0, 0, &tp);
-	if (error)
-		return error;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
-		VFS_I(ip)->i_mode &= ~S_ISUID;
-		if (VFS_I(ip)->i_mode & S_IXGRP)
-			VFS_I(ip)->i_mode &= ~S_ISGID;
-		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-
-	if (flags & XFS_PREALLOC_SET)
-		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
-	if (flags & XFS_PREALLOC_CLEAR)
-		ip->i_diflags &= ~XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (flags & XFS_PREALLOC_SYNC)
-		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp);
-}
-
 /*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
@@ -437,8 +403,7 @@ restart:
 		}
 
 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
-		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
-				NULL, &xfs_buffered_write_iomap_ops);
+		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
 		if (error)
 			return error;
 	} else
@@ -896,6 +861,21 @@ xfs_break_layouts(
 	return error;
 }
 
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(filp));
+
+	if (xfs_has_wsync(ip->i_mount))
+		return true;
+	if (filp->f_flags & (__O_SYNC | O_DSYNC))
+		return true;
+	if (IS_SYNC(file_inode(filp)))
+		return true;
+
+	return false;
+}
+
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
@@ -911,7 +891,6 @@ xfs_file_fallocate(
 	struct inode		*inode = file_inode(file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	long			error;
-	enum xfs_prealloc_flags	flags = 0;
 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	loff_t			new_size = 0;
 	bool			do_file_insert = false;
@@ -956,6 +935,10 @@ xfs_file_fallocate(
 		goto out_unlock;
 	}
 
+	error = file_modified(file);
+	if (error)
+		goto out_unlock;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
@@ -1005,8 +988,6 @@ xfs_file_fallocate(
 		}
 		do_file_insert = true;
 	} else {
-		flags |= XFS_PREALLOC_SET;
-
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
 			new_size = offset + len;
@@ -1052,20 +1033,12 @@ xfs_file_fallocate(
 		}
 
 		if (!xfs_is_always_cow_inode(ip)) {
-			error = xfs_alloc_file_space(ip, offset, len,
-						     XFS_BMAPI_PREALLOC);
+			error = xfs_alloc_file_space(ip, offset, len);
 			if (error)
 				goto out_unlock;
 		}
 	}
 
-	if (file->f_flags & O_DSYNC)
-		flags |= XFS_PREALLOC_SYNC;
-
-	error = xfs_update_prealloc_flags(ip, flags);
-	if (error)
-		goto out_unlock;
-
 	/* Change file size if needed */
 	if (new_size) {
 		struct iattr iattr;
@@ -1084,8 +1057,14 @@ xfs_file_fallocate(
 	 * leave shifted extents past EOF and hence losing access to
 	 * the data that is contained within them.
 	 */
-	if (do_file_insert)
+	if (do_file_insert) {
 		error = xfs_insert_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (xfs_file_sync_writes(file))
+		error = xfs_log_force_inode(ip);
 
 out_unlock:
 	xfs_iunlock(ip, iolock);
@@ -1117,21 +1096,6 @@ xfs_file_fadvise(
 	return ret;
 }
 
-/* Does this file, inode, or mount want synchronous writes? */
-static inline bool xfs_file_sync_writes(struct file *filp)
-{
-	struct xfs_inode	*ip = XFS_I(file_inode(filp));
-
-	if (xfs_has_wsync(ip->i_mount))
-		return true;
-	if (filp->f_flags & (__O_SYNC | O_DSYNC))
-		return true;
-	if (IS_SYNC(file_inode(filp)))
-		return true;
-
-	return false;
-}
-
 STATIC loff_t
 xfs_file_remap_range(
 	struct file		*file_in,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index da4af2142a2b..9644f938990c 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -87,6 +87,7 @@ xfs_inode_alloc(
 	/* VFS doesn't initialise i_mode or i_state! */
 	VFS_I(ip)->i_mode = 0;
 	VFS_I(ip)->i_state = 0;
+	mapping_set_large_folios(VFS_I(ip)->i_mapping);
 
 	XFS_STATS_INC(mp, vn_active);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -320,6 +321,7 @@ xfs_reinit_inode(
 	inode->i_rdev = dev;
 	inode->i_uid = uid;
 	inode->i_gid = gid;
+	mapping_set_large_folios(inode->i_mapping);
 	return error;
 }
 
@@ -749,7 +751,8 @@ again:
 
 	/*
 	 * If we have a real type for an on-disk inode, we can setup the inode
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 * now.	 If it's a new inode being created, xfs_init_new_inode will
+	 * handle it.
 	 */
 	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
 		xfs_setup_existing_inode(ip);
@@ -1851,28 +1854,20 @@ xfs_inodegc_worker(
 }
 
 /*
- * Force all currently queued inode inactivation work to run immediately, and
- * wait for the work to finish. Two pass - queue all the work first pass, wait
- * for it in a second pass.
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
 */
 void
 xfs_inodegc_flush(
 	struct xfs_mount	*mp)
 {
-	struct xfs_inodegc	*gc;
-	int			cpu;
-
 	if (!xfs_is_inodegc_enabled(mp))
 		return;
 
 	trace_xfs_inodegc_flush(mp, __return_address);
 
 	xfs_inodegc_queue_all(mp);
-
-	for_each_online_cpu(cpu) {
-		gc = per_cpu_ptr(mp->m_inodegc, cpu);
-		flush_work(&gc->work);
-	}
+	flush_workqueue(mp->m_inodegc_wq);
 }
 
 /*
@@ -1883,18 +1878,12 @@ void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
-	struct xfs_inodegc	*gc;
-	int			cpu;
-
 	if (!xfs_clear_inodegc_enabled(mp))
 		return;
 
 	xfs_inodegc_queue_all(mp);
+	drain_workqueue(mp->m_inodegc_wq);
 
-	for_each_online_cpu(cpu) {
-		gc = per_cpu_ptr(mp->m_inodegc, cpu);
-		cancel_work_sync(&gc->work);
-	}
 	trace_xfs_inodegc_stop(mp, __return_address);
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6771f357ad2c..04bf467b1090 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -988,8 +988,8 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
-			mapped_fsgid(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+			mapped_fsgid(mnt_userns, &init_user_ns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -1142,8 +1142,8 @@ xfs_create_tmpfile(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
-			mapped_fsgid(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+			mapped_fsgid(mnt_userns, &init_user_ns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c447bf04205a..b7e8f14d9fca 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -462,15 +462,6 @@ xfs_itruncate_extents(
 }
 
 /* from xfs_file.c */
-enum xfs_prealloc_flags {
-	XFS_PREALLOC_SET	= (1 << 1),
-	XFS_PREALLOC_CLEAR	= (1 << 2),
-	XFS_PREALLOC_SYNC	= (1 << 3),
-	XFS_PREALLOC_INVISIBLE	= (1 << 4),
-};
-
-int	xfs_update_prealloc_flags(struct xfs_inode *ip,
-		enum xfs_prealloc_flags flags);
 int	xfs_break_layouts(struct inode *inode, uint *iolock,
 		enum layout_break_reason reason);
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 174cd8950cb6..2515fe8299e1 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -372,7 +372,7 @@ int
 xfs_ioc_attr_list(
 	struct xfs_inode		*dp,
 	void __user			*ubuf,
-	int				bufsize,
+	size_t				bufsize,
 	int				flags,
 	struct xfs_attrlist_cursor __user *ucursor)
 {
@@ -627,86 +627,6 @@ xfs_attrmulti_by_handle(
 	return error;
 }
 
-int
-xfs_ioc_space(
-	struct file		*filp,
-	xfs_flock64_t		*bf)
-{
-	struct inode		*inode = file_inode(filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct iattr		iattr;
-	enum xfs_prealloc_flags	flags = XFS_PREALLOC_CLEAR;
-	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	int			error;
-
-	if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
-		return -EPERM;
-
-	if (!(filp->f_mode & FMODE_WRITE))
-		return -EBADF;
-
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-
-	if (xfs_is_always_cow_inode(ip))
-		return -EOPNOTSUPP;
-
-	if (filp->f_flags & O_DSYNC)
-		flags |= XFS_PREALLOC_SYNC;
-	if (filp->f_mode & FMODE_NOCMTIME)
-		flags |= XFS_PREALLOC_INVISIBLE;
-
-	error = mnt_want_write_file(filp);
-	if (error)
-		return error;
-
-	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
-	if (error)
-		goto out_unlock;
-	inode_dio_wait(inode);
-
-	switch (bf->l_whence) {
-	case 0: /*SEEK_SET*/
-		break;
-	case 1: /*SEEK_CUR*/
-		bf->l_start += filp->f_pos;
-		break;
-	case 2: /*SEEK_END*/
-		bf->l_start += XFS_ISIZE(ip);
-		break;
-	default:
-		error = -EINVAL;
-		goto out_unlock;
-	}
-
-	if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
-
-	if (bf->l_start > XFS_ISIZE(ip)) {
-		error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
-				bf->l_start - XFS_ISIZE(ip), 0);
-		if (error)
-			goto out_unlock;
-	}
-
-	iattr.ia_valid = ATTR_SIZE;
-	iattr.ia_size = bf->l_start;
-	error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp),
-				    &iattr);
-	if (error)
-		goto out_unlock;
-
-	error = xfs_update_prealloc_flags(ip, flags);
-
-out_unlock:
-	xfs_iunlock(ip, iolock);
-	mnt_drop_write_file(filp);
-	return error;
-}
-
 /* Return 0 on success or positive error */
 int
 xfs_fsbulkstat_one_fmt(
@@ -1544,7 +1464,7 @@ xfs_ioc_getbmap(
 
 	if (bmx.bmv_count < 2)
 		return -EINVAL;
-	if (bmx.bmv_count > ULONG_MAX / recsize)
+	if (bmx.bmv_count >= INT_MAX / recsize)
 		return -ENOMEM;
 
 	buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
@@ -1935,6 +1855,15 @@ xfs_fs_eofblocks_from_user(
 }
 
 /*
+ * These long-unused ioctls were removed from the official ioctl API in 5.17,
+ * but retain these definitions so that we can log warnings about them.
+ */
+#define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
+
+/*
 * Note: some of the ioctl's return positive numbers as a
 * byte count indicating success, such as readlink_by_handle.
 * So we don't "sign flip" like most other routines.  This means
@@ -1964,13 +1893,11 @@ xfs_file_ioctl(
 	case XFS_IOC_ALLOCSP:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64: {
-		xfs_flock64_t		bf;
-
-		if (copy_from_user(&bf, arg, sizeof(bf)))
-			return -EFAULT;
-		return xfs_ioc_space(filp, &bf);
-	}
+	case XFS_IOC_FREESP64:
+		xfs_warn_once(mp,
+	"%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported",
+				current->comm);
+		return -ENOTTY;
 	case XFS_IOC_DIOINFO: {
 		struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
 		struct dioattr		da;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 28453a6d4461..d4abba2c13c1 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -10,12 +10,6 @@ struct xfs_bstat;
 struct xfs_ibulk;
 struct xfs_inogrp;
 
-extern int
-xfs_ioc_space(
-	struct file		*filp,
-	xfs_flock64_t		*bf);
-
 int
 xfs_ioc_swapext(
 	xfs_swapext_t	*sxp);
@@ -38,8 +32,9 @@ xfs_readlink_by_handle(
 int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
 		uint32_t opcode, void __user *uname, void __user *value,
 		uint32_t *len, uint32_t flags);
-int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize,
-	int flags, struct xfs_attrlist_cursor __user *ucursor);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+		      size_t bufsize, int flags,
+		      struct xfs_attrlist_cursor __user *ucursor);
 
 extern struct dentry *
 xfs_handle_to_dentry(
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 8783af203cfc..004ed2a251e8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,22 +28,6 @@
 
 #ifdef BROKEN_X86_ALIGNMENT
 STATIC int
-xfs_compat_flock64_copyin(
-	xfs_flock64_t		*bf,
-	compat_xfs_flock64_t	__user *arg32)
-{
-	if (get_user(bf->l_type,	&arg32->l_type) ||
-	    get_user(bf->l_whence,	&arg32->l_whence) ||
-	    get_user(bf->l_start,	&arg32->l_start) ||
-	    get_user(bf->l_len,		&arg32->l_len) ||
-	    get_user(bf->l_sysid,	&arg32->l_sysid) ||
-	    get_user(bf->l_pid,		&arg32->l_pid) ||
-	    copy_from_user(bf->l_pad,	&arg32->l_pad,	4*sizeof(u32)))
-		return -EFAULT;
-	return 0;
-}
-
-STATIC int
 xfs_compat_ioc_fsgeometry_v1(
 	struct xfs_mount	  *mp,
 	compat_xfs_fsop_geom_v1_t __user *arg32)
@@ -445,17 +429,6 @@ xfs_file_compat_ioctl(
 
 	switch (cmd) {
 #if defined(BROKEN_X86_ALIGNMENT)
-	case XFS_IOC_ALLOCSP_32:
-	case XFS_IOC_FREESP_32:
-	case XFS_IOC_ALLOCSP64_32:
-	case XFS_IOC_FREESP64_32: {
-		struct xfs_flock64	bf;
-
-		if (xfs_compat_flock64_copyin(&bf, arg))
-			return -EFAULT;
-		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-		return xfs_ioc_space(filp, &bf);
-	}
	case XFS_IOC_FSGEOMETRY_V1_32:
 		return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg);
 	case XFS_IOC_FSGROWFSDATA_32: {
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 9929482bf358..c14852362fce 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -142,28 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq {
 	_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
 
 #ifdef BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct compat_xfs_flock64 {
-	__s16		l_type;
-	__s16		l_whence;
-	__s64		l_start	__attribute__((packed));
-			/* len == 0 means until end of file */
-	__s64		l_len __attribute__((packed));
-	__s32		l_sysid;
-	__u32		l_pid;
-	__s32		l_pad[4];	/* reserve area */
-} compat_xfs_flock64_t;
-
-#define XFS_IOC_ALLOCSP_32	_IOW('X', 10, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP_32	_IOW('X', 11, struct compat_xfs_flock64)
-#define XFS_IOC_ALLOCSP64_32	_IOW('X', 36, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP64_32	_IOW('X', 37, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP_32	_IOW('X', 40, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP_32	_IOW('X', 41, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP64_32	_IOW('X', 42, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP64_32	_IOW('X', 43, struct compat_xfs_flock64)
-#define XFS_IOC_ZERO_RANGE_32	_IOW('X', 57, struct compat_xfs_flock64)
-
 typedef struct compat_xfs_fsop_geom_v1 {
 	__u32		blocksize;	/* filesystem (data) block size */
 	__u32		rtextsize;	/* realtime extent size */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 093758440ad5..e552ce541ec2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -28,7 +28,6 @@
 #include "xfs_dquot.h"
 #include "xfs_reflink.h"
 
-
 #define XFS_ALLOC_ALIGN(mp, off) \
 	(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
 
@@ -54,7 +53,8 @@ xfs_bmbt_to_iomap(
 	struct xfs_inode	*ip,
 	struct iomap		*iomap,
 	struct xfs_bmbt_irec	*imap,
-	u16			flags)
+	unsigned int		mapping_flags,
+	u16			iomap_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
@@ -71,16 +71,22 @@ xfs_bmbt_to_iomap(
 		iomap->type = IOMAP_DELALLOC;
 	} else {
 		iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+		if (mapping_flags & IOMAP_DAX)
+			iomap->addr += target->bt_dax_part_off;
+
 		if (imap->br_state == XFS_EXT_UNWRITTEN)
 			iomap->type = IOMAP_UNWRITTEN;
 		else
 			iomap->type = IOMAP_MAPPED;
+
 	}
 	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-	iomap->bdev = target->bt_bdev;
-	iomap->dax_dev = target->bt_daxdev;
-	iomap->flags = flags;
+	if (mapping_flags & IOMAP_DAX)
+		iomap->dax_dev = target->bt_daxdev;
+	else
+		iomap->bdev = target->bt_bdev;
+	iomap->flags = iomap_flags;
 
 	if (xfs_ipincount(ip) &&
 	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
@@ -188,6 +194,7 @@ xfs_iomap_write_direct(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		offset_fsb,
 	xfs_fileoff_t		count_fsb,
+	unsigned int		flags,
 	struct xfs_bmbt_irec	*imap)
 {
 	struct xfs_mount	*mp = ip->i_mount;
@@ -229,7 +236,7 @@ xfs_iomap_write_direct(
 	 * the reserve block pool for bmbt block allocation if there is no space
 	 * left but we need to do unwritten extent conversion.
 	 */
-	if (IS_DAX(VFS_I(ip))) {
+	if (flags & IOMAP_DAX) {
 		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
 		if (imap->br_state == XFS_EXT_UNWRITTEN) {
 			force = true;
@@ -620,7 +627,7 @@ imap_needs_alloc(
 	    imap->br_startblock == DELAYSTARTBLOCK)
 		return true;
 	/* we convert unwritten extents before copying the data for DAX */
-	if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+	if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
 		return true;
 	return false;
 }
@@ -800,7 +807,7 @@ xfs_direct_write_iomap_begin(
 
 	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
 
 allocate_blocks:
 	error = -EAGAIN;
@@ -826,23 +833,24 @@ allocate_blocks:
 	xfs_iunlock(ip, lockmode);
 
 	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
-			&imap);
+			flags, &imap);
 	if (error)
 		return error;
 
 	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+				 iomap_flags | IOMAP_F_NEW);
 
 out_found_cow:
 	xfs_iunlock(ip, lockmode);
 	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
 	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
 	if (imap.br_startblock != HOLESTARTBLOCK) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
 		if (error)
 			return error;
 	}
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
 
 out_unlock:
 	if (lockmode)
@@ -1052,23 +1060,24 @@ retry:
 	 */
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
 
 found_imap:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 
 found_cow:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (imap.br_startoff <= offset_fsb) {
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
 		if (error)
 			return error;
-		return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+					 IOMAP_F_SHARED);
 	}
 
 	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
-	return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1177,7 +1186,8 @@ xfs_read_iomap_begin(
 	if (error)
 		return error;
 	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+				 shared ? IOMAP_F_SHARED : 0);
 }
 
 const struct iomap_ops xfs_read_iomap_ops = {
@@ -1236,7 +1246,8 @@ xfs_seek_iomap_begin(
 		if (data_fsb < cow_fsb + cmap.br_blockcount)
 			end_fsb = min(end_fsb, data_fsb);
 		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
-		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+					  IOMAP_F_SHARED);
 		/*
 		 * This is a COW extent, so we must probe the page cache
 		 * because there could be dirty page cache being backed
@@ -1258,7 +1269,7 @@ xfs_seek_iomap_begin(
 	imap.br_state = XFS_EXT_NORM;
 done:
 	xfs_trim_extent(&imap, offset_fsb, end_fsb);
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 	return error;
@@ -1305,9 +1316,40 @@ out_unlock:
 	if (error)
 		return error;
 	ASSERT(nimaps);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
 	.iomap_begin		= xfs_xattr_iomap_begin,
 };
+
+int
+xfs_zero_range(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	loff_t			len,
+	bool			*did_zero)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	if (IS_DAX(inode))
+		return dax_zero_range(inode, pos, len, did_zero,
+				      &xfs_direct_write_iomap_ops);
+	return iomap_zero_range(inode, pos, len, did_zero,
+				&xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_truncate_page(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	bool			*did_zero)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	if (IS_DAX(inode))
+		return dax_truncate_page(inode, pos, did_zero,
+					 &xfs_direct_write_iomap_ops);
+	return iomap_truncate_page(inode, pos, did_zero,
+				   &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..e88dc162c785 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -12,13 +12,19 @@ struct xfs_inode;
 struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
-		xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
+		xfs_fileoff_t count_fsb, unsigned int flags,
+		struct xfs_bmbt_irec *imap);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
 		xfs_fileoff_t end_fsb);
 
-int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-		struct xfs_bmbt_irec *, u16);
+int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+		struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+		u16 iomap_flags);
+
+int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+		bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
 
 static inline xfs_filblks_t
 xfs_aligned_fsb_count(
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a607d6aca5c4..b79b3846e71b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -511,27 +511,6 @@ xfs_vn_get_link(
 	return ERR_PTR(error);
 }
 
-STATIC const char *
-xfs_vn_get_link_inline(
-	struct dentry		*dentry,
-	struct inode		*inode,
-	struct delayed_call	*done)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	char			*link;
-
-	ASSERT(ip->i_df.if_format == XFS_DINODE_FMT_LOCAL);
-
-	/*
-	 * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
-	 * if_data is junk.
-	 */
-	link = ip->i_df.if_u1.if_data;
-	if (XFS_IS_CORRUPT(ip->i_mount, !link))
-		return ERR_PTR(-EFSCORRUPTED);
-	return link;
-}
-
 static uint32_t
 xfs_stat_blksize(
 	struct xfs_inode	*ip)
@@ -911,8 +890,8 @@ xfs_setattr_size(
 	 */
 	if (newsize > oldsize) {
 		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
-		error = iomap_zero_range(inode, oldsize, newsize - oldsize,
-				&did_zeroing, &xfs_buffered_write_iomap_ops);
+		error = xfs_zero_range(ip, oldsize, newsize - oldsize,
+				&did_zeroing);
 	} else {
 		/*
 		 * iomap won't detect a dirty page over an unwritten block (or a
@@ -924,8 +903,7 @@ xfs_setattr_size(
 						     newsize);
 		if (error)
 			return error;
-		error = iomap_truncate_page(inode, newsize, &did_zeroing,
-				&xfs_buffered_write_iomap_ops);
+		error = xfs_truncate_page(ip, newsize, &did_zeroing);
 	}
 
 	if (error)
@@ -1250,14 +1228,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {
 	.update_time		= xfs_vn_update_time,
 };
 
-static const struct inode_operations xfs_inline_symlink_inode_operations = {
-	.get_link		= xfs_vn_get_link_inline,
-	.getattr		= xfs_vn_getattr,
-	.setattr		= xfs_vn_setattr,
-	.listxattr		= xfs_vn_listxattr,
-	.update_time		= xfs_vn_update_time,
-};
-
 /* Figure out if this file actually supports DAX. */
 static bool
 xfs_inode_supports_dax(
@@ -1332,9 +1302,9 @@ xfs_diflags_to_iflags(
 * Initialize the Linux inode.
 *
 * When reading existing inodes from disk this is called directly from xfs_iget,
- * when creating a new inode it is called from xfs_ialloc after setting up the
- * inode. These callers have different criteria for clearing XFS_INEW, so leave
- * it up to the caller to deal with unlocking the inode appropriately.
+ * when creating a new inode it is called from xfs_init_new_inode after setting
+ * up the inode. These callers have different criteria for clearing XFS_INEW, so
+ * leave it up to the caller to deal with unlocking the inode appropriately.
 */
 void
 xfs_setup_inode(
@@ -1408,10 +1378,7 @@ xfs_setup_iops(
 		inode->i_fop = &xfs_dir_file_operations;
 		break;
 	case S_IFLNK:
-		if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
-			inode->i_op = &xfs_inline_symlink_inode_operations;
-		else
-			inode->i_op = &xfs_symlink_inode_operations;
+		inode->i_op = &xfs_symlink_inode_operations;
 		break;
 	default:
 		inode->i_op = &xfs_inode_operations;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c174262a074e..09a8fba84ff9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -61,6 +61,7 @@ typedef __u32			xfs_nlink_t;
 #include <linux/ratelimit.h>
 #include <linux/rhashtable.h>
 #include <linux/xattr.h>
+#include <linux/mnt_idmapping.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 6c93c8ada6f3..83a039762b81 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -103,6 +103,39 @@ xlog_cil_iovec_space(
 }
 
 /*
+ * shadow buffers can be large, so we need to use kvmalloc() here to ensure
+ * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
+ * back to vmalloc, so we can't actually do anything useful with gfp flags to
+ * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
+ * direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or buddy
+ * allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * Also, we are in memalloc_nofs_save task context here, so despite the use of
+ * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations.
+ * This is actually the only way to make vmalloc() do GFP_NOFS allocations, so
+ * let's just all pretend this is a GFP_KERNEL context operation....
+ */
+static inline void *
+xlog_cil_kvmalloc(
+	size_t		buf_size)
+{
+	gfp_t		flags = GFP_KERNEL;
+	void		*p;
+
+	flags &= ~__GFP_DIRECT_RECLAIM;
+	flags |= __GFP_NOWARN | __GFP_NORETRY;
+	do {
+		p = kmalloc(buf_size, flags);
+		if (!p)
+			p = vmalloc(buf_size);
+	} while (!p);
+
+	return p;
+}
+
+/*
 * Allocate or pin log vector buffers for CIL insertion.
 *
 * The CIL currently uses disposable buffers for copying a snapshot of the
@@ -203,25 +236,16 @@ xlog_cil_alloc_shadow_bufs(
 		 */
 		if (!lip->li_lv_shadow ||
 		    buf_size > lip->li_lv_shadow->lv_size) {
-
 			/*
 			 * We free and allocate here as a realloc would copy
-			 * unnecessary data. We don't use kmem_zalloc() for the
+			 * unnecessary data. We don't use kvzalloc() for the
 			 * same reason - we don't need to zero the data area in
 			 * the buffer, only the log vector header and the iovec
 			 * storage.
 			 */
 			kmem_free(lip->li_lv_shadow);
+			lv = xlog_cil_kvmalloc(buf_size);
 
-			/*
-			 * We are in transaction context, which means this
-			 * allocation will pick up GFP_NOFS from the
-			 * memalloc_nofs_save/restore context the transaction
-			 * holds. This means we can use GFP_KERNEL here so the
-			 * generic kvmalloc() code will run vmalloc on
-			 * contiguous page allocation failure as we require.
-			 */
-			lv = kvmalloc(buf_size, GFP_KERNEL);
 			memset(lv, 0, xlog_cil_iovec_space(niovecs));
 
 			lv->lv_item = lip;
@@ -1442,9 +1466,9 @@ out_shutdown:
 */
 bool
 xfs_log_item_in_current_chkpt(
-	struct xfs_log_item *lip)
+	struct xfs_log_item	*lip)
 {
-	struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+	struct xfs_cil		*cil = lip->li_mountp->m_log->l_cilp;
 
 	if (list_empty(&lip->li_cil))
 		return false;
@@ -1454,7 +1478,7 @@ xfs_log_item_in_current_chkpt(
 	 * first checkpoint it is written to. Hence if it is different to the
 	 * current sequence, we're in a new checkpoint.
 	 */
-	return lip->li_seq == ctx->sequence;
+	return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 53366cc0bc9e..96c997ed2ec8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -27,7 +27,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_ag.h"
 #include "xfs_quota.h"
-
+#include "xfs_reflink.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -3498,6 +3498,28 @@ xlog_recover_finish(
 
 	xlog_recover_process_iunlinks(log);
 	xlog_recover_check_summary(log);
+
+	/*
+	 * Recover any CoW staging blocks that are still referenced by the
+	 * ondisk refcount metadata.  During mount there cannot be any live
+	 * staging extents as we have not permitted any user modifications.
+	 * Therefore, it is safe to free them all right now, even on a
+	 * read-only mount.
+	 */
+	error = xfs_reflink_recover_cow(log->l_mp);
+	if (error) {
+		xfs_alert(log->l_mp,
+	"Failed to recover leftover CoW staging extents, err %d.",
+				error);
+		/*
+		 * If we get an error here, make sure the log is shut down
+		 * but return zero so that any log items committed since the
+		 * end of intents processing can be pushed through the CIL
+		 * and AIL.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 53366cc0bc9e..96c997ed2ec8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -27,7 +27,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_ag.h"
 #include "xfs_quota.h"
-
+#include "xfs_reflink.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
@@ -3498,6 +3498,28 @@ xlog_recover_finish(
 	xlog_recover_process_iunlinks(log);
 
 	xlog_recover_check_summary(log);
+
+	/*
+	 * Recover any CoW staging blocks that are still referenced by the
+	 * ondisk refcount metadata. During mount there cannot be any live
+	 * staging extents as we have not permitted any user modifications.
+	 * Therefore, it is safe to free them all right now, even on a
+	 * read-only mount.
+	 */
+	error = xfs_reflink_recover_cow(log->l_mp);
+	if (error) {
+		xfs_alert(log->l_mp,
+	"Failed to recover leftover CoW staging extents, err %d.",
+				error);
+		/*
+		 * If we get an error here, make sure the log is shut down
+		 * but return zero so that any log items committed since the
+		 * end of intents processing can be pushed through the CIL
+		 * and AIL.
+		 */
+		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+	}
+
 	return 0;
 }
@@ -3528,8 +3550,6 @@ xlog_recover_check_summary(
 	uint64_t		ifree;
 	int			error;
 
-	mp = log->l_mp;
-
 	freeblks = 0LL;
 	itotal = 0LL;
 	ifree = 0LL;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 359109b6f0d3..bed73e8002a5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -936,15 +936,6 @@ xfs_mountfs(
 		xfs_warn(mp,
 	"Unable to allocate reserve blocks. Continuing without reserve pool.");
 
-	/* Recover any CoW blocks that never got remapped. */
-	error = xfs_reflink_recover_cow(mp);
-	if (error) {
-		xfs_err(mp,
-	"Error %d recovering leftover CoW allocations.", error);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		goto out_quota;
-	}
-
 	/* Reserve AG blocks for future btree expansion. */
 	error = xfs_fs_reserve_ag_blocks(mp);
 	if (error && error != -ENOSPC)
@@ -955,7 +946,6 @@ xfs_mountfs(
 
  out_agresv:
 	xfs_fs_unreserve_ag_blocks(mp);
- out_quota:
 	xfs_qm_unmount_quotas(mp);
 out_rtunmount:
 	xfs_rtunmount_inodes(mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 5e1d29d8b2e7..4abe17312c2b 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -71,6 +71,40 @@ xfs_fs_get_uuid(
 }
 
 /*
+ * We cannot use file based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are
+ * not removed if the inode is reclaimed from memory before
+ * xfs_fs_block_commit() is called from the client to indicate that data has
+ * been written and the file size can be extended.
+ */
+static int
+xfs_fs_map_update_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+			0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	VFS_I(ip)->i_mode &= ~S_ISUID;
+	if (VFS_I(ip)->i_mode & S_IXGRP)
+		VFS_I(ip)->i_mode &= ~S_ISGID;
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return xfs_trans_commit(tp);
+}
+
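xfs_fs_map_update_inode() open codes the SUID/SGID stripping rule: a write by the server always clears the setuid bit, but clears the setgid bit only when group-execute is set, because a setgid bit without group-execute historically marks mandatory locking rather than a setgid executable. A small standalone demonstration of just the mode arithmetic (the kernel, of course, does this under a transaction):

#include <stdio.h>
#include <sys/stat.h>

static mode_t strip_suid_sgid(mode_t mode)
{
	mode &= ~S_ISUID;
	if (mode & S_IXGRP)
		mode &= ~S_ISGID;
	return mode;
}

int main(void)
{
	/* setuid+setgid executable: both bits stripped */
	printf("%04o -> %04o\n", (unsigned)06755,
	       (unsigned)strip_suid_sgid(06755));
	/* setgid without group exec: S_ISGID preserved */
	printf("%04o -> %04o\n", (unsigned)02644,
	       (unsigned)strip_suid_sgid(02644));
	return 0;
}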
+/*
  * Get a layout for the pNFS client.
  */
 int
@@ -155,7 +189,7 @@ xfs_fs_map_blocks(
 		xfs_iunlock(ip, lock_flags);
 
 		error = xfs_iomap_write_direct(ip, offset_fsb,
-				end_fsb - offset_fsb, &imap);
+				end_fsb - offset_fsb, 0, &imap);
 		if (error)
 			goto out_unlock;
 
@@ -164,16 +198,18 @@ xfs_fs_map_blocks(
 		 * that the blocks allocated and handed out to the client are
 		 * guaranteed to be present even after a server crash.
 		 */
-		error = xfs_update_prealloc_flags(ip,
-				XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+		error = xfs_fs_map_update_inode(ip);
+		if (!error)
+			error = xfs_log_force_inode(ip);
 		if (error)
 			goto out_unlock;
+
 	} else {
 		xfs_iunlock(ip, lock_flags);
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
@@ -255,7 +291,7 @@ xfs_fs_commit_blocks(
 		length = end - start;
 		if (!length)
 			continue;
-
+
 		/*
 		 * Make sure reads through the pagecache see the new data.
 		 */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 47fe60e1a887..7d5a31827681 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -303,13 +303,6 @@ xfs_qm_scall_setqlim(
 		return 0;
 
 	/*
-	 * We don't want to race with a quotaoff so take the quotaoff lock.
-	 * We don't hold an inode lock, so there's nothing else to stop
-	 * a quotaoff from happening.
-	 */
-	mutex_lock(&q->qi_quotaofflock);
-
-	/*
 	 * Get the dquot (locked) before we start, as we need to do a
 	 * transaction to allocate it if it doesn't exist. Once we have the
 	 * dquot, unlock it so we can start the next transaction safely. We hold
@@ -319,7 +312,7 @@ xfs_qm_scall_setqlim(
 	error = xfs_qm_dqget(mp, id, type, true, &dqp);
 	if (error) {
 		ASSERT(error != -ENOENT);
-		goto out_unlock;
+		return error;
 	}
 
 	defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
@@ -415,8 +408,6 @@ xfs_qm_scall_setqlim(
 
 out_rele:
 	xfs_qm_dqrele(dqp);
-out_unlock:
-	mutex_unlock(&q->qi_quotaofflock);
 	return error;
 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cb0edb1d68ef..db70060e7bf6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -749,7 +749,10 @@ xfs_reflink_end_cow(
 }
 
 /*
- * Free leftover CoW reservations that didn't get cleaned out.
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
 */
 int
 xfs_reflink_recover_cow(
@@ -1269,8 +1272,7 @@ xfs_reflink_zero_posteof(
 		return 0;
 
 	trace_xfs_zero_eof(ip, isize, pos - isize);
-	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
-			&xfs_buffered_write_iomap_ops);
+	return xfs_zero_range(ip, isize, pos - isize, NULL);
 }
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 778b57b1f020..4c0dee78b2f8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -331,13 +331,34 @@ xfs_set_inode_alloc(
 	return xfs_is_inode32(mp) ? maxagi : agcount;
 }
 
-static bool
-xfs_buftarg_is_dax(
-	struct super_block	*sb,
-	struct xfs_buftarg	*bt)
+static int
+xfs_setup_dax_always(
+	struct xfs_mount	*mp)
 {
-	return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
-			bdev_nr_sectors(bt->bt_bdev));
+	if (!mp->m_ddev_targp->bt_daxdev &&
+	    (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
+		xfs_alert(mp,
+			"DAX unsupported by block device. Turning off DAX.");
+		goto disable_dax;
+	}
+
+	if (mp->m_super->s_blocksize != PAGE_SIZE) {
+		xfs_alert(mp,
+			"DAX not supported for blocksize. Turning off DAX.");
+		goto disable_dax;
+	}
+
+	if (xfs_has_reflink(mp)) {
+		xfs_alert(mp, "DAX and reflink cannot be used together!");
+		return -EINVAL;
+	}
+
+	xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+	return 0;
+
+disable_dax:
+	xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+	return 0;
 }
 
 STATIC int
@@ -370,26 +391,19 @@ STATIC void
 xfs_close_devices(
 	struct xfs_mount	*mp)
 {
-	struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
-
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
 		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-		struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
 
 		xfs_free_buftarg(mp->m_logdev_targp);
 		xfs_blkdev_put(logdev);
-		fs_put_dax(dax_logdev);
 	}
 	if (mp->m_rtdev_targp) {
 		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-		struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
 
 		xfs_free_buftarg(mp->m_rtdev_targp);
 		xfs_blkdev_put(rtdev);
-		fs_put_dax(dax_rtdev);
 	}
 	xfs_free_buftarg(mp->m_ddev_targp);
-	fs_put_dax(dax_ddev);
 }
 
 /*
@@ -407,8 +421,6 @@ xfs_open_devices(
 	struct xfs_mount	*mp)
 {
 	struct block_device	*ddev = mp->m_super->s_bdev;
-	struct dax_device	*dax_ddev = fs_dax_get_by_bdev(ddev);
-	struct dax_device	*dax_logdev = NULL, *dax_rtdev = NULL;
 	struct block_device	*logdev = NULL, *rtdev = NULL;
 	int			error;
 
@@ -418,8 +430,7 @@ xfs_open_devices(
 	if (mp->m_logname) {
 		error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
 		if (error)
-			goto out;
-		dax_logdev = fs_dax_get_by_bdev(logdev);
+			return error;
 	}
 
 	if (mp->m_rtname) {
@@ -433,25 +444,24 @@ xfs_open_devices(
 			error = -EINVAL;
 			goto out_close_rtdev;
 		}
-		dax_rtdev = fs_dax_get_by_bdev(rtdev);
 	}
 
 	/*
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = -ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
@@ -467,14 +477,9 @@ xfs_open_devices(
 	xfs_free_buftarg(mp->m_ddev_targp);
  out_close_rtdev:
 	xfs_blkdev_put(rtdev);
-	fs_put_dax(dax_rtdev);
  out_close_logdev:
-	if (logdev && logdev != ddev) {
+	if (logdev && logdev != ddev)
 		xfs_blkdev_put(logdev);
-		fs_put_dax(dax_logdev);
-	}
- out:
-	fs_put_dax(dax_ddev);
 	return error;
 }
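The unwind labels in xfs_open_devices() follow the usual kernel cleanup idiom: each acquisition gets a label, and an error jumps to the label that releases everything acquired so far, in reverse order; the dax_* puts removed above are exactly the kind of extra state this pattern keeps manageable. A minimal user-space sketch of the idiom (the file handles and names are invented for illustration):

#include <stdio.h>

static int open_three(FILE **a, FILE **b, FILE **c)
{
	int error = -1;

	*a = fopen("/dev/null", "r");
	if (!*a)
		return error;

	*b = fopen("/dev/null", "r");
	if (!*b)
		goto out_close_a;

	*c = fopen("/dev/null", "r");
	if (!*c)
		goto out_close_b;

	return 0;		/* success: caller owns a, b and c */

 out_close_b:
	fclose(*b);
 out_close_a:
	fclose(*a);
	return error;
}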
@@ -730,6 +735,7 @@ xfs_fs_sync_fs(
 	int			wait)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
+	int			error;
 
 	trace_xfs_fs_sync_fs(mp, __return_address);
 
@@ -739,7 +745,10 @@ xfs_fs_sync_fs(
 	if (!wait)
 		return 0;
 
-	xfs_log_force(mp, XFS_LOG_SYNC);
+	error = xfs_log_force(mp, XFS_LOG_SYNC);
+	if (error)
+		return error;
+
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
@@ -1593,26 +1602,9 @@ xfs_fs_fill_super(
 		sb->s_flags |= SB_I_VERSION;
 
 	if (xfs_has_dax_always(mp)) {
-		bool rtdev_is_dax = false, datadev_is_dax;
-
-		xfs_warn(mp,
-		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-
-		datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
-		if (mp->m_rtdev_targp)
-			rtdev_is_dax = xfs_buftarg_is_dax(sb,
-						mp->m_rtdev_targp);
-		if (!rtdev_is_dax && !datadev_is_dax) {
-			xfs_alert(mp,
-			"DAX unsupported by block device. Turning off DAX.");
-			xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
-		}
-		if (xfs_has_reflink(mp)) {
-			xfs_alert(mp,
-			"DAX and reflink cannot be used together!");
-			error = -EINVAL;
+		error = xfs_setup_dax_always(mp);
+		if (error)
 			goto out_filestream_unmount;
-		}
 	}
 
 	if (xfs_has_discard(mp)) {
@@ -1739,15 +1731,6 @@ xfs_remount_rw(
 	 */
 	xfs_restore_resvblks(mp);
 	xfs_log_work_queue(mp);
-
-	/* Recover any CoW blocks that never got remapped. */
-	error = xfs_reflink_recover_cow(mp);
-	if (error) {
-		xfs_err(mp,
-	"Error %d recovering leftover CoW allocations.", error);
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		return error;
-	}
 	xfs_blockgc_start(mp);
 
 	/* Create the per-AG metadata reservation pool. */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index fc2c6a404647..affbedf78160 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -22,6 +22,7 @@
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
+#include "xfs_error.h"
 
 /* ----- Kernel only functions below ----- */
 int
@@ -96,17 +97,15 @@ xfs_readlink_bmap_ilocked(
 
 int
 xfs_readlink(
-	struct xfs_inode *ip,
-	char		 *link)
+	struct xfs_inode	*ip,
+	char			*link)
 {
-	struct xfs_mount *mp = ip->i_mount;
-	xfs_fsize_t	pathlen;
-	int		error = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		pathlen;
+	int			error = -EFSCORRUPTED;
 
 	trace_xfs_readlink(ip);
 
-	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
-
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
@@ -121,12 +120,22 @@ xfs_readlink(
 			 __func__, (unsigned long long) ip->i_ino,
 			 (long long) pathlen);
 		ASSERT(0);
-		error = -EFSCORRUPTED;
 		goto out;
 	}
-
-	error = xfs_readlink_bmap_ilocked(ip, link);
+	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		/*
+		 * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
+		 * if if_data is junk.
+		 */
+		if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+			goto out;
+
+		memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+		error = 0;
+	} else {
+		error = xfs_readlink_bmap_ilocked(ip, link);
+	}
 
 out:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -184,8 +193,8 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
-			mapped_fsgid(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+			mapped_fsgid(mnt_userns, &init_user_ns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 8608f804388f..574b80c29fe1 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -67,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = {
 static struct attribute *xfs_mp_attrs[] = {
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_mp);
 
 struct kobj_type xfs_mp_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_mp_attrs,
+	.default_groups = xfs_mp_groups,
 };
 
 #ifdef DEBUG
@@ -239,11 +240,12 @@ static struct attribute *xfs_dbg_attrs[] = {
 #endif
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_dbg);
 
 struct kobj_type xfs_dbg_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_dbg_attrs,
+	.default_groups = xfs_dbg_groups,
 };
 
 #endif /* DEBUG */
@@ -296,11 +298,12 @@ static struct attribute *xfs_stats_attrs[] = {
 	ATTR_LIST(stats_clear),
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_stats);
 
 struct kobj_type xfs_stats_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_stats_attrs,
+	.default_groups = xfs_stats_groups,
 };
 
 /* xlog */
@@ -381,11 +384,12 @@ static struct attribute *xfs_log_attrs[] = {
 	ATTR_LIST(write_grant_head),
 	NULL,
 };
+ATTRIBUTE_GROUPS(xfs_log);
 
 struct kobj_type xfs_log_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_log_attrs,
+	.default_groups = xfs_log_groups,
 };
 
 /*
@@ -534,12 +538,12 @@ static struct attribute *xfs_error_attrs[] = {
 	ATTR_LIST(retry_timeout_seconds),
 	NULL,
 };
-
+ATTRIBUTE_GROUPS(xfs_error);
 
 static struct kobj_type xfs_error_cfg_ktype = {
 	.release = xfs_sysfs_release,
 	.sysfs_ops = &xfs_sysfs_ops,
-	.default_attrs = xfs_error_attrs,
+	.default_groups = xfs_error_groups,
 };
 
 static struct kobj_type xfs_error_ktype = {
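Each ATTRIBUTE_GROUPS(name) line above generates the boilerplate that the new .default_groups member expects from the existing NULL-terminated name##_attrs[] array. Roughly what ATTRIBUTE_GROUPS(xfs_mp) expands to, per the macro in include/linux/sysfs.h (kernel-context sketch, not standalone code):

/* one group wrapping the attrs array... */
static const struct attribute_group xfs_mp_group = {
	.attrs	= xfs_mp_attrs,
};

/* ...and a NULL-terminated array of group pointers for .default_groups */
static const struct attribute_group *xfs_mp_groups[] = {
	&xfs_mp_group,
	NULL,
};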
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 234a9d9c2f43..59e2f9031b9f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -942,8 +942,17 @@ xfs_trans_cancel(
 
 	trace_xfs_trans_cancel(tp, _RET_IP_);
 
-	if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+	/*
+	 * It's never valid to cancel a transaction with deferred ops attached,
+	 * because the transaction is effectively dirty. Complain about this
+	 * loudly before freeing the in-memory defer items.
+	 */
+	if (!list_empty(&tp->t_dfops)) {
+		ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		dirty = true;
 		xfs_defer_cancel(tp);
+	}
 
 	/*
 	 * See if the caller is relying on us to shut down the