Diffstat (limited to 'fs/xfs'): 54 files changed, 812 insertions, 772 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 8930d5254e1d..b99da294e9a3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -119,6 +119,15 @@ config XFS_RT See the xfs man page in section 5 for additional information. + This option is mandatory to support zoned block devices. For these + devices, the realtime subvolume must be backed by a zoned block + device and a regular block device used as the main device (for + metadata). If the zoned block device is a host-managed SMR hard-disk + containing conventional zones at the beginning of its address space, + XFS will use the disk conventional zones as the main device and the + remaining sequential write required zones as the backing storage for + the realtime subvolume. + If unsure, say N. config XFS_DRAIN_INTENTS @@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select DEBUG_FS + depends on DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index de840abc0bcd..57e47077c75a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -73,7 +73,8 @@ #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 -#define XFS_ERRTAG_MAX 46 +#define XFS_ERRTAG_FORCE_ZERO_RANGE 46 +#define XFS_ERRTAG_MAX 47 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ -XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ +XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) #endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 4423932a2313..4ae638f1c2c5 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -98,6 +98,15 @@ xfs_group_max_blocks( return xg->xg_mount->m_groups[xg->xg_type].blocks; } +static inline xfs_rfsblock_t +xfs_groups_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups, + enum xfs_group_type type) +{ + return (xfs_rfsblock_t)mp->m_groups[type].blocks * nr_groups; +} + static inline xfs_fsblock_t xfs_group_start_fsb( struct xfs_group *xg) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 6c50cb2ece19..908e7060428c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -31,6 +31,7 @@ typedef uint32_t xlog_tid_t; #define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ #define XLOG_MAX_RECORD_BSIZE (256*1024) #define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_CYCLE_DATA_SIZE (XLOG_HEADER_CYCLE_SIZE / BBSIZE) #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ @@ -125,7 +126,17 @@ struct xlog_op_header { #define XLOG_FMT XLOG_FMT_LINUX_LE #endif -typedef struct xlog_rec_header { +struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log */ + __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE]; + __u8 xh_reserved[252]; +}; + +/* actual ext header payload size for checksumming */ +#define 
XLOG_REC_EXT_SIZE \ + offsetofend(struct xlog_rec_ext_header, xh_cycle_data) + +struct xlog_rec_header { __be32 h_magicno; /* log record (LR) identifier : 4 */ __be32 h_cycle; /* write cycle of log : 4 */ __be32 h_version; /* LR version : 4 */ @@ -135,7 +146,7 @@ typedef struct xlog_rec_header { __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ - __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + __be32 h_cycle_data[XLOG_CYCLE_DATA_SIZE]; /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ @@ -160,30 +171,19 @@ typedef struct xlog_rec_header { * (little-endian) architectures. */ __u32 h_pad0; -} xlog_rec_header_t; + + __u8 h_reserved[184]; + struct xlog_rec_ext_header h_ext[]; +}; #ifdef __i386__ #define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) -#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_pad0) #else -#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_pad0) #define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) #endif /* __i386__ */ -typedef struct xlog_rec_ext_header { - __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ -} xlog_rec_ext_header_t; - -/* - * Quite misnamed, because this union lays out the actual on-disk log buffer. - */ -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; - /* not an on-disk structure, but needed by log recovery in userspace */ struct xfs_log_iovec { void *i_addr; /* beginning address of region */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 7bfa3242e2c5..2e9715cc1641 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,9 +174,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); - XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); - XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 512); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 512); + XFS_CHECK_OFFSET(struct xlog_rec_header, h_reserved, 328); + XFS_CHECK_OFFSET(struct xlog_rec_ext_header, xh_reserved, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 763d941a8420..551d7ae46c5c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -29,11 +29,9 @@ typedef uint8_t xfs_dqtype_t; * flags for q_flags field in the dquot. 
*/ #define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ - { XFS_DQFLAG_DIRTY, "DIRTY" }, \ - { XFS_DQFLAG_FREEING, "FREEING" } + { XFS_DQFLAG_DIRTY, "DIRTY" } /* * We have the possibility of all three quota types being active at once, and diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d36a6ae0abe5..03f1e2493334 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -50,6 +50,12 @@ struct xfs_rtgroup { uint8_t *rtg_rsum_cache; struct xfs_open_zone *rtg_open_zone; }; + + /* + * Count of outstanding GC operations for zoned XFS. Any RTG with a + * non-zero rtg_gccount will not be picked as new GC victim. + */ + atomic_t rtg_gccount; }; /* @@ -58,12 +64,6 @@ struct xfs_rtgroup { */ #define XFS_RTG_FREE XA_MARK_0 -/* - * For zoned RT devices this is set on groups that are fully written and that - * have unused blocks. Used by the garbage collection to pick targets. - */ -#define XFS_RTG_RECLAIMABLE XA_MARK_1 - static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -365,4 +365,12 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, # define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ +static inline xfs_rfsblock_t +xfs_rtgs_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups) +{ + return xfs_groups_to_rfsbs(mp, nr_groups, XG_TYPE_RTG); +} + #endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c index b0791a71931c..b40f71f878b5 100644 --- a/fs/xfs/libxfs/xfs_zones.c +++ b/fs/xfs/libxfs/xfs_zones.c @@ -95,6 +95,7 @@ xfs_zone_validate_seq( case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_ACTIVE: return xfs_zone_validate_wp(zone, rtg, write_pointer); case BLK_ZONE_COND_FULL: return xfs_zone_validate_full(zone, rtg, write_pointer); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 2ef7742be7d3..7bfa37c99480 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1249,7 +1249,7 @@ xchk_irele( * hits do not clear DONTCACHE, so we must do it here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a90a011c7e5f..4f7040c9ddf0 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1933,7 +1933,7 @@ xrep_inode_pptr( * Unlinked inodes that cannot be added to the directory tree will not * have a parent pointer. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return 0; /* Children of the superblock do not have parent pointers. */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 26721fab5cab..091c79e432e5 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -376,6 +376,36 @@ out_incomplete: return error; } +static uint +xchk_nlinks_ilock_dir( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* + * We're going to scan the directory entries, so we must be ready to + * pull the data fork mappings into memory if they aren't already. 
+ */ + if (xfs_need_iread_extents(&ip->i_df)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * We're going to scan the parent pointers, so we must be ready to + * pull the attr fork mappings into memory if they aren't already. + */ + if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && + xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * Take the IOLOCK so that other threads cannot start a directory + * update while we're scanning. + */ + lock_mode |= XFS_IOLOCK_SHARED; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + /* Walk a directory to bump the observed link counts of the children. */ STATIC int xchk_nlinks_collect_dir( @@ -394,8 +424,7 @@ xchk_nlinks_collect_dir( return 0; /* Prevent anyone from changing this directory while we walk it. */ - xfs_ilock(dp, XFS_IOLOCK_SHARED); - lock_mode = xfs_ilock_data_map_shared(dp); + lock_mode = xchk_nlinks_ilock_dir(dp); /* * The dotdot entry of an unlinked directory still points to the last @@ -452,7 +481,6 @@ out_abort: xchk_iscan_abort(&xnc->collect_iscan); out_unlock: xfs_iunlock(dp, lock_mode); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return error; } diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 9c12cb844231..4e550a1d5353 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -152,11 +152,10 @@ xrep_orphanage_create( } /* Try to find the orphanage directory. */ - inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); + orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE)); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); - goto out_unlock_root; + goto out_dput_root; } /* @@ -167,10 +166,10 @@ xrep_orphanage_create( */ if (d_really_is_negative(orphanage_dentry)) { orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, - orphanage_dentry, 0750); + orphanage_dentry, 0750, NULL); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) - goto out_unlock_root; + goto out_dput_orphanage; } /* Not a directory? Bail out. */ @@ -200,9 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - dput(orphanage_dentry); -out_unlock_root: - inode_unlock(VFS_I(sc->mp->m_rootip)); + end_creating(orphanage_dentry); out_dput_root: dput(root_dentry); out: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 3b692c4acc1e..11d5de10fd56 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -915,7 +915,7 @@ xchk_pptr_looks_zapped( * Temporary files that cannot be linked into the directory tree do not * have attr forks because they cannot ever have parents. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return false; /* diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 58d6d4ed2853..5c5374c44c5a 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -155,12 +155,9 @@ xchk_quota_item( * We want to validate the bmap record for the storage backing this * dquot, so we need to lock the dquot and the quota file. For quota * operations, the locking order is first the ILOCK and then the dquot. - * However, dqiterate gave us a locked dquot, so drop the dquot lock to - * get the ILOCK. 
*/ - xfs_dqunlock(dq); xchk_ilock(sc, XFS_ILOCK_SHARED); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); /* * Except for the root dquot, the actual dquot we got must either have @@ -251,6 +248,7 @@ xchk_quota_item( xchk_quota_item_timer(sc, offset, &dq->q_rtb); out: + mutex_unlock(&dq->q_qlock); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -330,7 +328,7 @@ xchk_quota( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xchk_quota_item(&sqi, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 8f4c8d41f308..b1d661aa5f06 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -184,17 +184,13 @@ xrep_quota_item( /* * We might need to fix holes in the bmap record for the storage * backing this dquot, so we need to lock the dquot and the quota file. - * dqiterate gave us a locked dquot, so drop the dquot lock to get the - * ILOCK_EXCL. */ - xfs_dqunlock(dq); xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_dqlock(dq); - + mutex_lock(&dq->q_qlock); error = xrep_quota_item_bmap(sc, dq, &dirty); xchk_iunlock(sc, XFS_ILOCK_EXCL); if (error) - return error; + goto out_unlock_dquot; /* Check the limits. */ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { @@ -246,7 +242,7 @@ xrep_quota_item( xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); if (!dirty) - return 0; + goto out_unlock_dquot; trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); @@ -257,8 +253,10 @@ xrep_quota_item( xfs_qm_adjust_dqtimers(dq); } xfs_trans_log_dquot(sc->tp, dq); - error = xfs_trans_roll(&sc->tp); - xfs_dqlock(dq); + return xfs_trans_roll(&sc->tp); + +out_unlock_dquot: + mutex_unlock(&dq->q_qlock); return error; } @@ -513,7 +511,7 @@ xrep_quota_problems( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xrep_quota_item(&rqi, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index e4105aaafe84..d412a8359784 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -563,6 +563,7 @@ xqcheck_compare_dquot( return -ECANCELED; } + mutex_lock(&dq->q_qlock); mutex_lock(&xqc->lock); error = xfarray_load_sparse(counts, dq->q_id, &xcdq); if (error) @@ -589,7 +590,9 @@ xqcheck_compare_dquot( xchk_set_incomplete(xqc->sc); error = -ECANCELED; } +out_unlock: mutex_unlock(&xqc->lock); + mutex_unlock(&dq->q_qlock); if (error) return error; @@ -597,10 +600,6 @@ xqcheck_compare_dquot( return -ECANCELED; return 0; - -out_unlock: - mutex_unlock(&xqc->lock); - return error; } /* @@ -636,7 +635,7 @@ xqcheck_walk_observations( return error; error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) return error; @@ -674,7 +673,7 @@ xqcheck_compare_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index dd8554c755b5..51be8d8d261b 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -52,13 +52,11 @@ xqcheck_commit_dquot( bool dirty = false; int error = 0; - /* Unlock the dquot just long enough to allocate a transaction. 
*/ - xfs_dqunlock(dq); error = xchk_trans_alloc(xqc->sc, 0); - xfs_dqlock(dq); if (error) return error; + mutex_lock(&dq->q_qlock); xfs_trans_dqjoin(xqc->sc->tp, dq); if (xchk_iscan_aborted(&xqc->iscan)) { @@ -115,23 +113,12 @@ xqcheck_commit_dquot( if (dq->q_id) xfs_qm_adjust_dqtimers(dq); xfs_trans_log_dquot(xqc->sc->tp, dq); - - /* - * Transaction commit unlocks the dquot, so we must re-lock it so that - * the caller can put the reference (which apparently requires a locked - * dquot). - */ - error = xrep_trans_commit(xqc->sc); - xfs_dqlock(dq); - return error; + return xrep_trans_commit(xqc->sc); out_unlock: mutex_unlock(&xqc->lock); out_cancel: xchk_trans_cancel(xqc->sc); - - /* Re-lock the dquot so the caller can put the reference. */ - xfs_dqlock(dq); return error; } @@ -156,7 +143,7 @@ xqcheck_commit_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } @@ -187,7 +174,7 @@ xqcheck_commit_dqtype( return error; error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) return error; diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 5902398185a8..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,7 +184,7 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index cdd13ed9c569..ed2e8c64b1a8 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -834,7 +834,7 @@ xfarray_sort_scan( si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); - next_pos = folio_pos(si->folio) + folio_size(si->folio); + next_pos = folio_next_pos(si->folio); si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) si->last_folio_idx--; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..56a544638491 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -271,7 +271,7 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. */ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio), NULL); + folio_next_pos(folio), NULL); } /* @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 06ca11731e43..2208a720ec3f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -514,7 +514,7 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. 
*/ - if (!(VFS_I(ip)->i_state & I_FREEING)) + if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 773d959965dc..47edf3041631 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1751,7 +1751,7 @@ xfs_init_buftarg( const char *descr) { /* The maximum size of the buftarg is only known once the sb is read. */ - btp->bt_nr_sectors = (xfs_daddr_t)-1; + btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8fa7bdf59c91..e25cd2a160f3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache; */ struct xfs_buf; +#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index ee49f20875af..6917de832191 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -726,8 +726,10 @@ xfs_trim_rtgroup_extents( break; } - if (!tr.queued) + if (!tr.queued) { + kfree(tr.extents); break; + } /* * We hand the extent list to the discard function here so the diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0bd8022e47b4..612ca682a513 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -31,7 +31,7 @@ * * ip->i_lock * qi->qi_tree_lock - * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_qlock * dquot->q_flush (xfs_dqflock() and friends) * qi->qi_lru_lock * @@ -801,10 +801,11 @@ xfs_dq_get_next_id( static struct xfs_dquot * xfs_qm_dqget_cache_lookup( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, - xfs_dqid_t id) + xfs_dqid_t id, + xfs_dqtype_t type) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; restart: @@ -816,16 +817,12 @@ restart: return NULL; } - xfs_dqlock(dqp); - if (dqp->q_flags & XFS_DQFLAG_FREEING) { - xfs_dqunlock(dqp); + if (!lockref_get_not_dead(&dqp->q_lockref)) { mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } - - dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); @@ -836,8 +833,7 @@ restart: /* * Try to insert a new dquot into the in-core cache. If an error occurs the * caller should throw away the dquot and start over. Otherwise, the dquot - * is returned locked (and held by the cache) as if there had been a cache - * hit. + * is returned (and held by the cache) as if there had been a cache hit. * * The insert needs to be done under memalloc_nofs context because the radix * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in @@ -848,11 +844,12 @@ restart: static int xfs_qm_dqget_cache_insert( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, xfs_dqid_t id, + xfs_dqtype_t type, struct xfs_dquot *dqp) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); unsigned int nofs_flags; int error; @@ -860,14 +857,11 @@ xfs_qm_dqget_cache_insert( mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { - /* Duplicate found! Caller must try again. 
*/ trace_xfs_dqget_dup(dqp); goto out_unlock; } - /* Return a locked dquot to the caller, with a reference taken. */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; + lockref_init(&dqp->q_lockref); qi->qi_dquots++; out_unlock: @@ -903,7 +897,7 @@ xfs_qm_dqget_checks( /* * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a - * locked dquot, doing an allocation (if requested) as needed. + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( @@ -913,8 +907,6 @@ xfs_qm_dqget( bool can_alloc, struct xfs_dquot **O_dqpp) { - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -923,28 +915,30 @@ xfs_qm_dqget( return error; restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); + if (dqp) + goto found; error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); if (error) return error; - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } trace_xfs_dqget_miss(dqp); +found: *O_dqpp = dqp; return 0; } @@ -999,15 +993,16 @@ xfs_qm_dqget_inode( struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, - struct xfs_dquot **O_dqpp) + struct xfs_dquot **dqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; xfs_dqid_t id; int error; + ASSERT(!*dqpp); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqget_checks(mp, type); if (error) return error; @@ -1019,11 +1014,9 @@ xfs_qm_dqget_inode( id = xfs_qm_id_for_quotatype(ip, type); restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); + if (dqp) + goto found; /* * Dquot cache miss. We don't want to keep the inode lock across @@ -1049,7 +1042,6 @@ restart: if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; - xfs_dqlock(dqp); goto dqret; } } else { @@ -1058,21 +1050,26 @@ restart: return -ESRCH; } - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); - *O_dqpp = dqp; +found: + trace_xfs_dqattach_get(dqp); + *dqpp = dqp; return 0; } @@ -1098,63 +1095,41 @@ xfs_qm_dqget_next( else if (error != 0) break; + mutex_lock(&dqp->q_qlock); if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { *dqpp = dqp; return 0; } - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } return error; } /* - * Release a reference to the dquot (decrement ref-count) and unlock it. 
- * - * If there is a group quota attached to this dquot, carefully release that - * too without tripping over deadlocks'n'stuff. + * Release a reference to the dquot. */ void -xfs_qm_dqput( +xfs_qm_dqrele( struct xfs_dquot *dqp) { - ASSERT(dqp->q_nrefs > 0); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (!dqp) + return; - trace_xfs_dqput(dqp); + trace_xfs_dqrele(dqp); - if (--dqp->q_nrefs == 0) { + if (lockref_put_or_lock(&dqp->q_lockref)) + return; + if (!--dqp->q_lockref.count) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - trace_xfs_dqput_free(dqp); + trace_xfs_dqrele_free(dqp); if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } - xfs_dqunlock(dqp); -} - -/* - * Release a dquot. Flush it if dirty, then dqput() it. - * dquot must not be locked. - */ -void -xfs_qm_dqrele( - struct xfs_dquot *dqp) -{ - if (!dqp) - return; - - trace_xfs_dqrele(dqp); - - xfs_dqlock(dqp); - /* - * We don't care to flush it if the dquot is dirty here. - * That will create stutters that we want to avoid. - * Instead we do a delayed write when we try to reclaim - * a dirty dquot. Also xfs_sync will take part of the burden... - */ - xfs_qm_dqput(dqp); + spin_unlock(&dqp->q_lockref.lock); } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 61217adf5ba5..bbb824adca82 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -71,7 +71,7 @@ struct xfs_dquot { xfs_dqtype_t q_type; uint16_t q_flags; xfs_dqid_t q_id; - uint q_nrefs; + struct lockref q_lockref; int q_bufoffset; xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; @@ -121,21 +121,6 @@ static inline void xfs_dqfunlock(struct xfs_dquot *dqp) complete(&dqp->q_flush); } -static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -static inline void xfs_dqlock(struct xfs_dquot *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - -static inline void xfs_dqunlock(struct xfs_dquot *dqp) -{ - mutex_unlock(&dqp->q_qlock); -} - static inline int xfs_dquot_type(const struct xfs_dquot *dqp) { @@ -233,7 +218,6 @@ int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, int xfs_qm_dqget_uncached(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp); -void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); void xfs_dqlockn(struct xfs_dqtrx *q); @@ -246,9 +230,7 @@ void xfs_dquot_detach_buf(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { - xfs_dqlock(dqp); - dqp->q_nrefs++; - xfs_dqunlock(dqp); + lockref_get(&dqp->q_lockref); return dqp; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 271b195ebb93..b374cd9f1900 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -132,7 +132,7 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_dqlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_qlock)) return XFS_ITEM_LOCKED; /* @@ -177,7 +177,7 @@ xfs_qm_dquot_logitem_push( out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return rval; } @@ -195,7 +195,7 @@ xfs_qm_dquot_logitem_release( * transaction layer, within trans_commit. Hence, no LI_HOLD flag * for the logitem. 
*/ - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } STATIC void diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2702fef2c90c..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,8 @@ #include "xfs_file.h" #include "xfs_aops.h" #include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -674,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -693,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -890,15 +901,7 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); @@ -1254,23 +1257,36 @@ xfs_falloc_zero_range( struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; - trace_xfs_zero_file_space(XFS_I(inode)); + trace_xfs_zero_file_space(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); - if (error) - return error; + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. 
+ */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); + if (error) + return error; - len = round_up(offset + len, blksize) - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(XFS_I(inode), offset, len); + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); + } if (error) return error; return xfs_falloc_setsize(file, new_size); diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index f19fce557354..5a3e3bf4e7cc 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -233,14 +233,11 @@ xfs_open_by_handle( xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); - int error; - int fd; int permflag; - struct file *filp; struct inode *inode; struct dentry *dentry; fmode_t fmode; - struct path path; + struct path path __free(path_put) = {}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -249,12 +246,11 @@ xfs_open_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); + path.dentry = dentry; /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EPERM; #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; @@ -263,48 +259,30 @@ xfs_open_by_handle( permflag = hreq->oflags; fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } + (fmode & FMODE_WRITE) && IS_APPEND(inode)) + return -EPERM; - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) + return -EPERM; /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) + return -EISDIR; - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } + path.mnt = mntget(parfilp->f_path.mnt); - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } + FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred)); + if (fdf.err) + return fdf.err; if (S_ISREG(inode->i_mode)) { + struct file *filp = fd_prepare_file(fdf); + filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; + return fd_publish(fdf); } int diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 7c541fb373d5..3c1557fb1cf0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -285,7 +285,7 @@ xfs_inode_mark_sick( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } @@ -309,7 +309,7 @@ xfs_inode_mark_corrupt( * is not the case here. 
*/ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e44040206851..23a920437fe4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -334,7 +334,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; - unsigned long state = inode->i_state; + unsigned long state = inode_state_read_once(inode); error = inode_init_always(mp->m_super, inode); @@ -345,7 +345,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - inode->i_state = state; + inode_state_assign_raw(inode, state); mapping_set_folio_min_order(inode->i_mapping, M_IGEO(mp)->min_folio_order); return error; @@ -358,7 +358,7 @@ xfs_reinit_inode( static int xfs_iget_recycle( struct xfs_perag *pag, - struct xfs_inode *ip) __releases(&ip->i_flags_lock) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -366,20 +366,6 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) - return -EAGAIN; - - /* - * We need to make it look like the inode is being reclaimed to prevent - * the actual reclaim workers from stomping over us while we recycle - * the inode. We can't clear the radix tree tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -411,7 +397,7 @@ xfs_iget_recycle( ip->i_flags |= XFS_INEW; xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; + inode_state_assign_raw(inode, I_NEW); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -576,10 +562,19 @@ xfs_iget_cache_hit( /* The inode fits the selection criteria; process it. */ if (ip->i_flags & XFS_IRECLAIMABLE) { - /* Drops i_flags_lock and RCU read lock. */ - error = xfs_iget_recycle(pag, ip); - if (error == -EAGAIN) + /* + * We need to make it look like the inode is being reclaimed to + * prevent the actual reclaim workers from stomping over us + * while we recycle the inode. We can't clear the radix tree + * tag yet as it requires pag_ici_lock to be held exclusive. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out_skip; + ip->i_flags |= XFS_IRECLAIM; + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = xfs_iget_recycle(pag, ip); if (error) return error; } else { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36b39539e561..f1f88e48fe22 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next( next_ip->i_prev_unlinked = prev_agino; trace_xfs_iunlink_reload_next(next_ip); rele: - ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE)); if (xfs_is_quotacheck_running(mp) && next_ip) xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); xfs_irele(next_ip); @@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout( */ xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); - VFS_I(tmpfile)->i_state |= I_LINKABLE; + inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE); *wip = tmpfile; return 0; @@ -2330,7 +2330,7 @@ retry: * flag from the inode so it doesn't accidentally get misused in * future. 
*/ - VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; + inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE); } out_commit: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 1bd411a1114c..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -113,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a6bb7ee7a27a..59eaad774371 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1408,10 +1408,8 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); - sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &icw); - sb_end_write(mp->m_super); - return error; + guard(super_write)(mp->m_super); + return xfs_blockgc_free_space(mp, &icw); } case XFS_IOC_EXCHANGE_RANGE: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d3f6e3e42a11..04f39ea15898 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1091,6 +1091,29 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { }; #endif /* CONFIG_XFS_RT */ +#ifdef DEBUG +static void +xfs_check_atomic_cow_conversion( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + const struct xfs_bmbt_irec *cmap) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec cmap2 = { }; + + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2)) + xfs_trim_extent(&cmap2, offset_fsb, count_fsb); + + ASSERT(cmap2.br_startoff == cmap->br_startoff); + ASSERT(cmap2.br_blockcount == cmap->br_blockcount); + ASSERT(cmap2.br_startblock == cmap->br_startblock); + ASSERT(cmap2.br_state == cmap->br_state); +} +#else +# define xfs_check_atomic_cow_conversion(...) ((void)0) +#endif + static int xfs_atomic_write_cow_iomap_begin( struct inode *inode, @@ -1102,9 +1125,10 @@ xfs_atomic_write_cow_iomap_begin( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); - xfs_filblks_t count_fsb = end_fsb - offset_fsb; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + const xfs_filblks_t count_fsb = end_fsb - offset_fsb; + xfs_filblks_t hole_count_fsb; int nmaps = 1; xfs_filblks_t resaligned; struct xfs_bmbt_irec cmap; @@ -1130,7 +1154,7 @@ xfs_atomic_write_cow_iomap_begin( return -EAGAIN; trace_xfs_iomap_atomic_write_cow(ip, offset, length); - +retry: xfs_ilock(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { @@ -1141,14 +1165,22 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + + /* + * cmap could extend outside the write range due to previous + * speculative preallocations. We must trim cmap to the write + * range because the cow fork treats written mappings to mean + * "write in progress". 
+ */ xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } - end_fsb = cmap.br_startoff; - count_fsb = end_fsb - offset_fsb; + hole_count_fsb = cmap.br_startoff - offset_fsb; - resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb, xfs_get_cowextsz_hint(ip)); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1169,8 +1201,10 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { - xfs_trim_extent(&cmap, offset_fsb, count_fsb); xfs_trans_cancel(tp); + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } @@ -1182,7 +1216,7 @@ xfs_atomic_write_cow_iomap_begin( * atomic writes to that same range will be aligned (and don't require * this COW-based method). */ - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); if (error) { @@ -1195,21 +1229,43 @@ xfs_atomic_write_cow_iomap_begin( if (error) goto out_unlock; + /* + * cmap could map more blocks than the range we passed into bmapi_write + * because of EXTSZALIGN or adjacent pre-existing unwritten mappings + * that were merged. Trim cmap to the original write range so that we + * don't convert more than we were asked to do for this write. + */ + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + found: if (cmap.br_state != XFS_EXT_NORM) { - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, - count_fsb); + error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff, + cmap.br_blockcount); if (error) goto out_unlock; cmap.br_state = XFS_EXT_NORM; + xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb, + &cmap); } - length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); - trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); +convert_delay: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap, + NULL); + if (error) + return error; + + /* + * Try the lookup again, because the delalloc conversion might have + * turned the COW mapping into unwritten, but we need it to be in + * written state. + */ + goto retry; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1702,6 +1758,8 @@ xfs_buffered_write_iomap_begin( struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, + iomap); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1767,21 +1825,41 @@ xfs_buffered_write_iomap_begin( } /* - * For zeroing, trim a delalloc extent that extends beyond the EOF - * block. If it starts beyond the EOF block, convert it to an + * For zeroing, trim extents that extend beyond the EOF block. If a + * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. 
*/ - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && - isnullstartblock(imap.br_startblock)) { + if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + u64 end; - if (offset_fsb >= eof_fsb) + if (isnullstartblock(imap.br_startblock) && + offset_fsb >= eof_fsb) goto convert_delay; - if (end_fsb > eof_fsb) { + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) end_fsb = eof_fsb; - xfs_trim_extent(&imap, offset_fsb, - end_fsb - offset_fsb); + + /* + * Look up dirty folios for unwritten mappings within EOF. + * Providing this bypasses the flush iomap uses to trigger + * extent conversion when unwritten mappings have dirty + * pagecache in need of zeroing. + * + * Trim the mapping to the end pos of the lookup, which in turn + * was trimmed to the end of the batch if it became full before + * the end of the mapping. + */ + if (imap.br_state == XFS_EXT_UNWRITTEN && + offset_fsb < eof_fsb) { + loff_t len = min(count, + XFS_FSB_TO_B(mp, imap.br_blockcount)); + + end = iomap_fill_dirty_folios(iter, offset, len); + end_fsb = min_t(xfs_fileoff_t, end_fsb, + XFS_B_TO_FSB(mp, end)); } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); } /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index caff0125faea..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1420,7 +1420,7 @@ xfs_setup_inode( bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state |= I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 603e85c1ab4c..a311385b23d8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -534,8 +534,8 @@ xlog_state_release_iclog( */ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && - !iclog->ic_header.h_tail_lsn) { - iclog->ic_header.h_tail_lsn = + !iclog->ic_header->h_tail_lsn) { + iclog->ic_header->h_tail_lsn = cpu_to_be64(atomic64_read(&log->l_tail_lsn)); } @@ -1279,11 +1279,12 @@ xlog_get_iclog_buffer_size( log->l_iclog_size = mp->m_logbsize; /* - * # headers = size / 32k - one header holds cycles from 32k of data. + * Combined size of the log record headers. The first 32k cycles + * are stored directly in the xlog_rec_header, the rest in the + * variable number of xlog_rec_ext_headers at its end. */ - log->l_iclog_heads = - DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); - log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; + log->l_iclog_hsize = struct_size(log->l_iclog->ic_header, h_ext, + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE) - 1); } void @@ -1367,9 +1368,8 @@ xlog_alloc_log( int num_bblks) { struct xlog *log; - xlog_rec_header_t *head; - xlog_in_core_t **iclogp; - xlog_in_core_t *iclog, *prev_iclog=NULL; + struct xlog_in_core **iclogp; + struct xlog_in_core *iclog, *prev_iclog = NULL; int i; int error = -ENOMEM; uint log2_size = 0; @@ -1436,13 +1436,6 @@ xlog_alloc_log( init_waitqueue_head(&log->l_flush_wait); iclogp = &log->l_iclog; - /* - * The amount of memory to allocate for the iclog structure is - * rather funky due to the way the structure is defined. It is - * done this way so that we can use different sizes for machines - * with different amounts of memory. See the definition of - * xlog_in_core_t in xfs_log_priv.h for details. 
- */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * @@ -1457,26 +1450,25 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kvzalloc(log->l_iclog_size, + iclog->ic_header = kvzalloc(log->l_iclog_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!iclog->ic_data) + if (!iclog->ic_header) goto out_free_iclog; - head = &iclog->ic_header; - memset(head, 0, sizeof(xlog_rec_header_t)); - head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); - head->h_version = cpu_to_be32( + iclog->ic_header->h_magicno = + cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + iclog->ic_header->h_version = cpu_to_be32( xfs_has_logv2(log->l_mp) ? 2 : 1); - head->h_size = cpu_to_be32(log->l_iclog_size); - /* new fields */ - head->h_fmt = cpu_to_be32(XLOG_FMT); - memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + iclog->ic_header->h_size = cpu_to_be32(log->l_iclog_size); + iclog->ic_header->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&iclog->ic_header->h_fs_uuid, &mp->m_sb.sb_uuid, + sizeof(iclog->ic_header->h_fs_uuid)); + iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1504,7 +1496,7 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); if (prev_iclog == log->l_iclog) break; @@ -1524,36 +1516,19 @@ xlog_pack_data( struct xlog_in_core *iclog, int roundoff) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - char *dp; - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + struct xlog_rec_header *rhead = iclog->ic_header; + __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn); + char *dp = iclog->ic_datap; + int i; - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size); i++) { - if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) - break; - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + for (i = 0; i < BTOBB(iclog->ic_offset + roundoff); i++) { + *xlog_cycle_data(rhead, i) = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - for (i = 1; i < log->l_iclog_heads; i++) - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + for (i = 0; i < (log->l_iclog_hsize >> BBSHIFT) - 1; i++) + rhead->h_ext[i].xh_cycle = cycle_lsn; } /* @@ -1578,16 +1553,11 @@ xlog_cksum( /* ... then for additional cycle data for v2 logs ... */ if (xfs_has_logv2(log->l_mp)) { - union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; - int i; - int xheads; + int xheads, i; - xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); - - for (i = 1; i < xheads; i++) { - crc = crc32c(crc, &xhdr[i].hic_xheader, - sizeof(struct xlog_rec_ext_header)); - } + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE) - 1; + for (i = 0; i < xheads; i++) + crc = crc32c(crc, &rhead->h_ext[i], XLOG_REC_EXT_SIZE); } /* ... 
and finally for the payload */ @@ -1671,11 +1641,11 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (is_vmalloc_addr(iclog->ic_data)) { - if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count)) + if (is_vmalloc_addr(iclog->ic_header)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_header, count)) goto shutdown; } else { - bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count); + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_header, count); } /* @@ -1804,19 +1774,19 @@ xlog_sync( size = iclog->ic_offset; if (xfs_has_logv2(log->l_mp)) size += roundoff; - iclog->ic_header.h_len = cpu_to_be32(size); + iclog->ic_header->h_len = cpu_to_be32(size); XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); - bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header->h_lsn)); /* Do we need to split this write into 2 parts? */ if (bno + BTOBB(count) > log->l_logBBsize) - xlog_split_iclog(log, &iclog->ic_header, bno, count); + xlog_split_iclog(log, iclog->ic_header, bno, count); /* calculcate the checksum */ - iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_header->h_crc = xlog_cksum(log, iclog->ic_header, iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection @@ -1827,11 +1797,11 @@ xlog_sync( */ #ifdef DEBUG if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { - iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); + iclog->ic_header->h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", - be64_to_cpu(iclog->ic_header.h_lsn)); + be64_to_cpu(iclog->ic_header->h_lsn)); } #endif xlog_verify_iclog(log, iclog, count); @@ -1843,10 +1813,10 @@ xlog_sync( */ STATIC void xlog_dealloc_log( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog, *next_iclog; - int i; + struct xlog_in_core *iclog, *next_iclog; + int i; /* * Destroy the CIL after waiting for iclog IO completion because an @@ -1858,7 +1828,7 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); iclog = next_iclog; } @@ -1880,7 +1850,7 @@ xlog_state_finish_copy( { lockdep_assert_held(&log->l_icloglock); - be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + be32_add_cpu(&iclog->ic_header->h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; } @@ -2303,7 +2273,7 @@ xlog_state_activate_iclog( * We don't need to cover the dummy. 
*/ if (*iclogs_changed == 0 && - iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + iclog->ic_header->h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { *iclogs_changed = 1; } else { /* @@ -2315,11 +2285,11 @@ xlog_state_activate_iclog( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - iclog->ic_header.h_tail_lsn = 0; + iclog->ic_header->h_num_logops = 0; + memset(iclog->ic_header->h_cycle_data, 0, + sizeof(iclog->ic_header->h_cycle_data)); + iclog->ic_header->h_lsn = 0; + iclog->ic_header->h_tail_lsn = 0; } /* @@ -2411,7 +2381,7 @@ xlog_get_lowest_lsn( iclog->ic_state == XLOG_STATE_DIRTY) continue; - lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lsn = be64_to_cpu(iclog->ic_header->h_lsn); if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; } while ((iclog = iclog->ic_next) != log->l_iclog); @@ -2446,7 +2416,7 @@ xlog_state_iodone_process_iclog( * If this is not the lowest lsn iclog, then we will leave it * for another completion to process. */ - header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + header_lsn = be64_to_cpu(iclog->ic_header->h_lsn); lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) return false; @@ -2609,9 +2579,9 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp) { - int log_offset; - xlog_rec_header_t *head; - xlog_in_core_t *iclog; + int log_offset; + struct xlog_rec_header *head; + struct xlog_in_core *iclog; restart: spin_lock(&log->l_icloglock); @@ -2629,7 +2599,7 @@ restart: goto restart; } - head = &iclog->ic_header; + head = iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; @@ -2794,7 +2764,7 @@ xlog_state_switch_iclogs( if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + iclog->ic_header->h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -2838,7 +2808,7 @@ xlog_force_and_check_iclog( struct xlog_in_core *iclog, bool *completed) { - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); int error; *completed = false; @@ -2850,7 +2820,7 @@ xlog_force_and_check_iclog( * If the iclog has already been completed and reused the header LSN * will have been rewritten by completion */ - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + if (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) *completed = true; return 0; } @@ -2983,7 +2953,7 @@ xlog_force_lsn( goto out_error; iclog = log->l_iclog; - while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + while (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) @@ -3249,7 +3219,7 @@ xlog_verify_dump_tail( { xfs_alert(log->l_mp, "ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x", - iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1, + iclog ? 
be64_to_cpu(iclog->ic_header->h_tail_lsn) : -1, atomic64_read(&log->l_tail_lsn), log->l_ailp->ail_head_lsn, log->l_curr_cycle, log->l_curr_block, @@ -3268,7 +3238,7 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog) { - xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header->h_tail_lsn); int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { @@ -3322,13 +3292,12 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - struct xlog_op_header *ophead; - xlog_in_core_t *icptr; - xlog_in_core_2_t *xhdr; - void *base_ptr, *ptr, *p; + struct xlog_rec_header *rhead = iclog->ic_header; + struct xlog_in_core *icptr; + void *base_ptr, *ptr; ptrdiff_t field_offset; uint8_t clientid; - int len, i, j, k, op_len; + int len, i, op_len; int idx; /* check validity of iclog pointers */ @@ -3342,11 +3311,10 @@ xlog_verify_iclog( spin_unlock(&log->l_icloglock); /* check log magic numbers */ - if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + if (rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - base_ptr = ptr = &iclog->ic_header; - p = &iclog->ic_header; + base_ptr = ptr = rhead; for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", @@ -3354,29 +3322,19 @@ xlog_verify_iclog( } /* check fields */ - len = be32_to_cpu(iclog->ic_header.h_num_logops); + len = be32_to_cpu(rhead->h_num_logops); base_ptr = ptr = iclog->ic_datap; - ophead = ptr; - xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = ptr; + struct xlog_op_header *ophead = ptr; + void *p = &ophead->oh_clientid; /* clientid is only 1 byte */ - p = &ophead->oh_clientid; field_offset = p - base_ptr; if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - clientid = xlog_get_client_id( - xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - clientid = xlog_get_client_id( - iclog->ic_header.h_cycle_data[idx]); - } + clientid = xlog_get_client_id(*xlog_cycle_data(rhead, idx)); } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, @@ -3392,13 +3350,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); - } + op_len = be32_to_cpu(*xlog_cycle_data(rhead, idx)); } ptr += sizeof(struct xlog_op_header) + op_len; } @@ -3529,19 +3481,19 @@ xlog_force_shutdown( STATIC int xlog_iclogs_empty( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog; + struct xlog_in_core *iclog = log->l_iclog; - iclog = log->l_iclog; do { /* endianness does not matter here, zero is zero in * any language. 
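/*
 * Worked example (not from the patch): the index arithmetic that
 * xlog_cycle_data(), defined further down in xfs_log_priv.h, hides from
 * callers such as xlog_verify_iclog() above.  With BBSIZE = 512,
 * XLOG_CYCLE_DATA_SIZE = 32768 / 512 = 64, so:
 *
 *	i = 10	-> &rhead->h_cycle_data[10]
 *	i = 64	-> j = 1, k = 0  -> &rhead->h_ext[0].xh_cycle_data[0]
 *	i = 100	-> j = 1, k = 36 -> &rhead->h_ext[0].xh_cycle_data[36]
 *
 * The "j - 1" accounts for the first 64 cycle words living in the regular
 * record header; only the overflow spills into the extended headers, which
 * is what lets the open-coded j/k computations above be deleted.
 */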
*/ - if (iclog->ic_header.h_num_logops) + if (iclog->ic_header->h_num_logops) return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); + return 1; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f443757e93c2..778ac47adb8c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -940,7 +940,7 @@ xlog_cil_set_ctx_write_state( struct xlog_in_core *iclog) { struct xfs_cil *cil = ctx->cil; - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); ASSERT(!ctx->commit_lsn); if (!ctx->start_lsn) { @@ -1458,9 +1458,9 @@ xlog_cil_push_work( */ spin_lock(&log->l_icloglock); if (ctx->start_lsn != ctx->commit_lsn) { - xfs_lsn_t plsn; + xfs_lsn_t plsn = be64_to_cpu( + ctx->commit_iclog->ic_prev->ic_header->h_lsn); - plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 0cfc654d8e87..0fe59f0525aa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -158,10 +158,8 @@ struct xlog_ticket { }; /* - * - A log record header is 512 bytes. There is plenty of room to grow the - * xlog_rec_header_t into the reserved space. - * - ic_data follows, so a write to disk can start at the beginning of - * the iclog. + * In-core log structure. + * * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. @@ -183,7 +181,7 @@ struct xlog_ticket { * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ -typedef struct xlog_in_core { +struct xlog_in_core { wait_queue_head_t ic_force_wait; wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; @@ -198,8 +196,7 @@ typedef struct xlog_in_core { /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; - xlog_in_core_2_t *ic_data; -#define ic_header ic_data->hic_header + struct xlog_rec_header *ic_header; #ifdef DEBUG bool ic_fail_crc : 1; #endif @@ -207,7 +204,7 @@ typedef struct xlog_in_core { struct work_struct ic_end_io_work; struct bio ic_bio; struct bio_vec ic_bvec[]; -} xlog_in_core_t; +}; /* * The CIL context is used to aggregate per-transaction details as well be @@ -409,7 +406,6 @@ struct xlog { struct list_head *l_buf_cancel_table; struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_bufs; /* number of iclog buffers */ @@ -422,7 +418,7 @@ struct xlog { /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ - xlog_in_core_t *l_iclog; /* head log queue */ + struct xlog_in_core *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last @@ -711,4 +707,21 @@ xlog_item_space( return round_up(nbytes, sizeof(uint64_t)); } +/* + * Cycles over XLOG_CYCLE_DATA_SIZE overflow into the extended header that was + * added for v2 logs. Addressing for the cycles array there is off by one, + * because the first batch of cycles is in the original header. 
+ */ +static inline __be32 *xlog_cycle_data(struct xlog_rec_header *rhead, unsigned i) +{ + if (i >= XLOG_CYCLE_DATA_SIZE) { + unsigned j = i / XLOG_CYCLE_DATA_SIZE; + unsigned k = i % XLOG_CYCLE_DATA_SIZE; + + return &rhead->h_ext[j - 1].xh_cycle_data[k]; + } + + return &rhead->h_cycle_data[i]; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 549d60959aee..03e42c7dab56 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,8 +190,8 @@ xlog_bwrite( */ STATIC void xlog_header_check_dump( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); @@ -207,8 +207,8 @@ xlog_header_check_dump( */ STATIC int xlog_header_check_recover( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -238,8 +238,8 @@ xlog_header_check_recover( */ STATIC int xlog_header_check_mount( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -400,7 +400,7 @@ xlog_find_verify_log_record( xfs_daddr_t i; char *buffer; char *offset = NULL; - xlog_rec_header_t *head = NULL; + struct xlog_rec_header *head = NULL; int error = 0; int smallmem = 0; int num_blks = *last_blk - start_blk; @@ -437,7 +437,7 @@ xlog_find_verify_log_record( goto out; } - head = (xlog_rec_header_t *)offset; + head = (struct xlog_rec_header *)offset; if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; @@ -1237,7 +1237,7 @@ xlog_find_tail( xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; char *offset = NULL; char *buffer; int error; @@ -1487,7 +1487,7 @@ xlog_add_record( int tail_cycle, int tail_block) { - xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + struct xlog_rec_header *recp = (struct xlog_rec_header *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -2863,23 +2863,12 @@ xlog_unpack_data( char *dp, struct xlog *log) { - int i, j, k; + int i; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + *(__be32 *)dp = *xlog_cycle_data(rhead, i); dp += BBSIZE; } - - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } } /* @@ -3008,7 +2997,7 @@ xlog_do_recovery_pass( int pass, xfs_daddr_t *first_bad) /* out: first bad log rec */ { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; @@ -3045,7 +3034,7 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; /* * xfsprogs has a bug where record length is based on lsunit but @@ -3152,7 +3141,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, split_hblks ? 
blk_no : 0, h_size); if (error) @@ -3234,7 +3223,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f046d1215b04..b871dfde372b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -236,7 +236,6 @@ typedef struct xfs_mount { bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; unsigned int m_zonegc_low_space; - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ /* max_atomic_write mount option value */ unsigned long long m_awu_max_bytes; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 23ba84ec919a..95be67ac6eb4 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -126,14 +126,16 @@ xfs_qm_dqpurge( void *data) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - int error = -EAGAIN; - xfs_dqlock(dqp); - if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) - goto out_unlock; - - dqp->q_flags |= XFS_DQFLAG_FREEING; + spin_lock(&dqp->q_lockref.lock); + if (dqp->q_lockref.count > 0 || __lockref_is_dead(&dqp->q_lockref)) { + spin_unlock(&dqp->q_lockref.lock); + return -EAGAIN; + } + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); + mutex_lock(&dqp->q_qlock); xfs_qm_dqunpin_wait(dqp); xfs_dqflock(dqp); @@ -144,6 +146,7 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need @@ -151,9 +154,9 @@ xfs_qm_dqpurge( */ error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { - xfs_dqfunlock(dqp); - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; + /* resurrect the refcount from the dead. */ + dqp->q_lockref.count = 0; + goto out_funlock; } if (!bp) goto out_funlock; @@ -177,7 +180,7 @@ out_funlock: !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; @@ -192,10 +195,6 @@ out_funlock: xfs_qm_dqdestroy(dqp); return 0; - -out_unlock: - xfs_dqunlock(dqp); - return error; } /* @@ -288,51 +287,6 @@ xfs_qm_unmount_quotas( xfs_qm_destroy_quotainos(mp->m_quotainfo); } -STATIC int -xfs_qm_dqattach_one( - struct xfs_inode *ip, - xfs_dqtype_t type, - bool doalloc, - struct xfs_dquot **IO_idqpp) -{ - struct xfs_dquot *dqp; - int error; - - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - error = 0; - - /* - * See if we already have it in the inode itself. IO_idqpp is &i_udquot - * or &i_gdquot. This made the code look weird, but made the logic a lot - * simpler. - */ - dqp = *IO_idqpp; - if (dqp) { - trace_xfs_dqattach_found(dqp); - return 0; - } - - /* - * Find the dquot from somewhere. This bumps the reference count of - * dquot and returns it locked. This can return ENOENT if dquot didn't - * exist on disk and we didn't ask it to allocate; ESRCH if quotas got - * turned off suddenly. - */ - error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); - if (error) - return error; - - trace_xfs_dqattach_get(dqp); - - /* - * dqget may have dropped and re-acquired the ilock, but it guarantees - * that the dquot returned is the one that should go in the inode. 
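/*
 * Illustrative sketch (not from the patch): the lockref teardown idiom that
 * xfs_qm_dqpurge() above (and xfs_qm_dquot_isolate() below) now rely on,
 * reduced to its core.  An object may only be torn down once its count is
 * zero and it has been marked dead under the lockref spinlock; lookups that
 * race with this use lockref_get_not_dead() and fail once the mark is set.
 * The function name is hypothetical.
 */
static bool example_try_begin_teardown(struct lockref *lref)
{
	spin_lock(&lref->lock);
	if (lref->count > 0 || __lockref_is_dead(lref)) {
		spin_unlock(&lref->lock);
		return false;		/* still referenced, or already dying */
	}
	lockref_mark_dead(lref);	/* no new references after this point */
	spin_unlock(&lref->lock);
	return true;			/* caller now owns the teardown */
}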
- */ - *IO_idqpp = dqp; - xfs_dqunlock(dqp); - return 0; -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -372,7 +326,7 @@ xfs_qm_dqattach_locked( ASSERT(!xfs_is_metadir_inode(ip)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; @@ -380,7 +334,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; @@ -388,7 +342,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -468,7 +422,7 @@ xfs_qm_dquot_isolate( struct xfs_qm_isolate *isol = arg; enum lru_status ret = LRU_SKIP; - if (!xfs_dqlock_nowait(dqp)) + if (!spin_trylock(&dqp->q_lockref.lock)) goto out_miss_busy; /* @@ -476,7 +430,7 @@ xfs_qm_dquot_isolate( * from the LRU, leave it for the freeing task to complete the freeing * process rather than risk it being free from under us here. */ - if (dqp->q_flags & XFS_DQFLAG_FREEING) + if (__lockref_is_dead(&dqp->q_lockref)) goto out_miss_unlock; /* @@ -485,16 +439,15 @@ xfs_qm_dquot_isolate( * again. */ ret = LRU_ROTATE; - if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) { + if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) goto out_miss_unlock; - } /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); + if (dqp->q_lockref.count) { + spin_unlock(&dqp->q_lockref.lock); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); @@ -518,10 +471,9 @@ xfs_qm_dquot_isolate( /* * Prevent lookups now that we are past the point of no return. 
*/ - dqp->q_flags |= XFS_DQFLAG_FREEING; - xfs_dqunlock(dqp); + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); - ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); @@ -529,7 +481,7 @@ xfs_qm_dquot_isolate( return LRU_REMOVED; out_miss_unlock: - xfs_dqunlock(dqp); + spin_unlock(&dqp->q_lockref.lock); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); @@ -1316,9 +1268,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + mutex_lock(&dqp->q_qlock); error = xfs_dquot_attach_buf(NULL, dqp); if (error) - return error; + goto out_unlock; trace_xfs_dqadjust(dqp); @@ -1348,8 +1301,10 @@ xfs_qm_quotacheck_dqadjust( } dqp->q_flags |= XFS_DQFLAG_DIRTY; - xfs_qm_dqput(dqp); - return 0; +out_unlock: + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); + return error; } /* @@ -1466,9 +1421,10 @@ xfs_qm_flush_one( struct xfs_buf *bp = NULL; int error = 0; - xfs_dqlock(dqp); - if (dqp->q_flags & XFS_DQFLAG_FREEING) - goto out_unlock; + if (!lockref_get_not_dead(&dqp->q_lockref)) + return 0; + + mutex_lock(&dqp->q_qlock); if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; @@ -1488,7 +1444,8 @@ xfs_qm_flush_one( xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -1904,16 +1861,12 @@ xfs_qm_vop_dqalloc( struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; int error; - uint lockflags; if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_metadir_inode(ip)); - lockflags = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockflags); - if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = inode->i_gid; @@ -1922,38 +1875,22 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, true); - if (error) { - xfs_iunlock(ip, lockflags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { - /* - * What we need is the dquot that has this uid, and - * if we send the inode to dqget, the uid of the inode - * takes priority over what's sent in the uid argument. - * We must unlock inode here before calling dqget if - * we're not sending the inode, because otherwise - * we'll deadlock by doing trans_reserve while - * holding ilock. - */ - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; } - /* - * Get the ilock in the right order. 
- */ - xfs_dqunlock(uq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { /* * Take an extra reference, because we'll return @@ -1966,16 +1903,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(gq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); @@ -1984,16 +1917,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { ASSERT(O_pdqpp); if (ip->i_projid != prid) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(pq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_pdquot); pq = xfs_qm_dqhold(ip->i_pdquot); @@ -2001,7 +1930,6 @@ xfs_qm_vop_dqalloc( } trace_xfs_dquot_dqalloc(ip); - xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; else @@ -2078,7 +2006,7 @@ xfs_qm_vop_chown( * back now. */ tp->t_flags |= XFS_TRANS_DIRTY; - xfs_dqlock(prevdq); + mutex_lock(&prevdq->q_qlock); if (isrt) { ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); prevdq->q_rtb.reserved -= ip->i_delayed_blks; @@ -2086,7 +2014,7 @@ xfs_qm_vop_chown( ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); prevdq->q_blk.reserved -= ip->i_delayed_blks; } - xfs_dqunlock(prevdq); + mutex_unlock(&prevdq->q_qlock); /* * Take an extra reference, because the inode is going to keep diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 35b64bc3a7a8..e88ed6ad0e65 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -57,7 +57,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; - int qi_dquots; + uint64_t qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dq in above chunk */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 245d754f382a..edc0aef3cf34 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -73,8 +73,10 @@ xfs_qm_statvfs( struct xfs_dquot *dqp; if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { + mutex_lock(&dqp->q_qlock); xfs_fill_statvfs_from_dquot(statp, ip, dqp); - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 0c78f30fa4a3..022e2179c06b 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -303,13 +303,12 @@ xfs_qm_scall_setqlim( } defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); - xfs_dqunlock(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); if (error) goto out_rele; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); xfs_trans_dqjoin(tp, dqp); /* @@ -459,6 +458,7 @@ xfs_qm_scall_getquota( * If everything's NULL, this dquot doesn't quite exist as far as * our utility programs are concerned. 
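/*
 * Illustrative sketch (not from the patch): the caller-side pattern after
 * the dquot locking rework, assuming xfs_qm_dqget() still hands back a
 * referenced dquot.  Field access is serialized by the new q_qlock mutex
 * while the lifetime is pinned by the lockref, so the old "xfs_dqlock ...
 * xfs_qm_dqput" pairing becomes "mutex_lock ... mutex_unlock ...
 * xfs_qm_dqrele", as in the statvfs and setqlim hunks above.  The function
 * name is hypothetical.
 */
static void example_inspect_dquot(struct xfs_dquot *dqp)
{
	mutex_lock(&dqp->q_qlock);
	/* read or adjust dqp->q_blk / dqp->q_ino / dqp->q_rtb here */
	mutex_unlock(&dqp->q_qlock);
	xfs_qm_dqrele(dqp);		/* drop the reference from the lookup */
}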
*/ + mutex_lock(&dqp->q_qlock); if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { error = -ENOENT; goto out_put; @@ -467,7 +467,8 @@ xfs_qm_scall_getquota( xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); out_put: - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -497,7 +498,8 @@ xfs_qm_scall_getquota_next( *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + mutex_unlock(&dqp->q_qlock); - xfs_qm_dqput(dqp); + xfs_qm_dqrele(dqp); return error; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 4c7f7ce4fd2f..94fbe3d99ec7 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -65,7 +65,7 @@ xfs_fs_get_quota_state( memset(state, 0, sizeof(*state)); if (!XFS_IS_QUOTA_ON(mp)) return 0; - state->s_incoredqs = q->qi_dquots; + state->s_incoredqs = min_t(uint64_t, q->qi_dquots, UINT_MAX); if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 36cda724da89..9d1ed9bb0bee 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - if ((inode->i_state & I_DIRTY_PAGES) || + if ((inode_state_read_once(inode) & I_DIRTY_PAGES) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&inode->i_dio_count)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e85a156dc17d..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = { * Table driven mount option parser. */ enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, @@ -114,7 +114,21 @@ enum { Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; +#define fsparam_dead(NAME) \ + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) + static const struct fs_parameter_spec xfs_fs_parameters[] = { + /* + * These mount options were supposed to be deprecated in September 2025 + * but the deprecation warning was buggy, so not all users were + * notified. The deprecation is now obnoxiously loud and postponed to + * September 2030. + */ + fsparam_dead("attr2"), + fsparam_dead("noattr2"), + fsparam_dead("ikeep"), + fsparam_dead("noikeep"), + fsparam_u32("logbufs", Opt_logbufs), fsparam_string("logbsize", Opt_logbsize), fsparam_string("logdev", Opt_logdev), @@ -786,6 +800,12 @@ xfs_fs_evict_inode( truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + + if (IS_ENABLED(CONFIG_XFS_RT) && + S_ISREG(inode->i_mode) && inode->i_private) { + xfs_open_zone_put(inode->i_private); + inode->i_private = NULL; + } } static void @@ -1373,16 +1393,25 @@ suffix_kstrtoull( static inline void xfs_fs_warn_deprecated( struct fs_context *fc, - struct fs_parameter *param, - uint64_t flag, - bool value) + struct fs_parameter *param) { - /* Don't print the warning if reconfiguring and current mount point - * already had the flag set + /* + * Always warn about someone passing in a deprecated mount option. 
+ * Previously we wouldn't print the warning if we were reconfiguring + * and current mount point already had the flag set, but that was not + * the right thing to do. + * + * Many distributions mount the root filesystem with no options in the + * initramfs and rely on mount -a to remount the root fs with the + * options in fstab. However, the old behavior meant that there would + * never be a warning about deprecated mount options for the root fs in + * /etc/fstab. On a single-fs system, that means no warning at all. + * + * Compounding this problem are distribution scripts that copy + * /proc/mounts to fstab, which means that we can't remove mount + * options unless we're 100% sure they have only ever been advertised + * in /proc/mounts in response to explicitly provided mount options. */ - if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && - !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) - return; xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } @@ -1408,6 +1437,9 @@ xfs_fs_parse_param( return opt; switch (opt) { + case Op_deprecated: + xfs_fs_warn_deprecated(fc, param); + return 0; case Opt_logbufs: parsing_mp->m_logbufs = result.uint_32; return 0; @@ -1528,7 +1560,6 @@ xfs_fs_parse_param( xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif - /* Following mount options will be removed in September 2025 */ case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1662,7 +1693,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA @@ -2221,7 +2255,7 @@ xfs_init_fs_context( struct xfs_mount *mp; int i; - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) return -ENOMEM; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 79b8641880ab..f70afbf3cb19 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1350,7 +1350,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->id = dqp->q_id; __entry->type = dqp->q_type; __entry->flags = dqp->q_flags; - __entry->nrefs = dqp->q_nrefs; + __entry->nrefs = data_race(dqp->q_lockref.count); __entry->res_bcount = dqp->q_blk.reserved; __entry->res_rtbcount = dqp->q_rtb.reserved; @@ -1399,7 +1399,6 @@ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); -DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); @@ -1409,9 +1408,8 @@ DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); -DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqrele_free); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); @@ -4934,7 +4932,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, __entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->offset = iclog->ic_offset; __entry->flags = iclog->ic_flags; - __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->lsn = be64_to_cpu(iclog->ic_header->h_lsn); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d state %s refcnt %d 
offset %u lsn 0x%llx flags %s caller %pS", diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 765456bf3428..c842ce06acd6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -393,7 +393,7 @@ xfs_trans_dqlockedjoin( unsigned int i; ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { - xfs_dqlock(q[0].qt_dquot); + mutex_lock(&q[0].qt_dquot->q_qlock); xfs_trans_dqjoin(tp, q[0].qt_dquot); } else if (q[2].qt_dquot == NULL) { xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); @@ -693,7 +693,7 @@ xfs_trans_unreserve_and_mod_dquots( locked = already_locked; if (qtrx->qt_blk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_blk.reserved -= @@ -701,7 +701,7 @@ xfs_trans_unreserve_and_mod_dquots( } if (qtrx->qt_ino_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_ino.reserved -= @@ -710,14 +710,14 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } if (locked && !already_locked) - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } } @@ -820,7 +820,7 @@ xfs_trans_dqresv( struct xfs_dquot_res *blkres; struct xfs_quota_limits *qlim; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); @@ -887,16 +887,16 @@ xfs_trans_dqresv( XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) goto error_corrupt; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return 0; error_return: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; error_corrupt: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 1147bacb2da8..bbcf21704ea0 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -26,14 +26,22 @@ #include "xfs_trace.h" #include "xfs_mru_cache.h" +static void +xfs_open_zone_free_rcu( + struct callback_head *cb) +{ + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); + + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); +} + void xfs_open_zone_put( struct xfs_open_zone *oz) { - if (atomic_dec_and_test(&oz->oz_ref)) { - xfs_rtgroup_rele(oz->oz_rtg); - kfree(oz); - } + if (atomic_dec_and_test(&oz->oz_ref)) + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); } static inline uint32_t @@ -95,9 +103,6 @@ xfs_zone_account_reclaimable( */ trace_xfs_zone_emptied(rtg); - if (!was_full) - xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); - spin_lock(&zi->zi_used_buckets_lock); if (!was_full) xfs_zone_remove_from_bucket(zi, rgno, from_bucket); @@ -119,7 +124,6 @@ xfs_zone_account_reclaimable( xfs_zone_add_to_bucket(zi, rgno, to_bucket); spin_unlock(&zi->zi_used_buckets_lock); - xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) wake_up_process(zi->zi_gc_thread); } else if (to_bucket != from_bucket) { @@ -134,6 +138,28 @@ xfs_zone_account_reclaimable( } } +/* + * Check if we have any zones that can be reclaimed by looking at the entry + * counters for the zone buckets. 
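/*
 * Illustrative sketch (not from the patch): why xfs_open_zone_put() above
 * now defers the free with call_rcu().  The cached-zone lookup added later
 * in this patch picks the pointer up under rcu_read_lock() and only keeps
 * it if atomic_inc_not_zero() on oz_ref succeeds, so the structure must
 * stay readable for a grace period after the final reference drops.  This
 * sketch generalizes the lookup to an __rcu-annotated slot; the real code
 * uses the plain inode->i_private pointer.
 */
static struct xfs_open_zone *
example_try_get_published_zone(struct xfs_open_zone __rcu **slot)
{
	struct xfs_open_zone *oz;

	rcu_read_lock();
	oz = rcu_dereference(*slot);
	if (oz && !atomic_inc_not_zero(&oz->oz_ref))
		oz = NULL;	/* lost the race against the final put */
	rcu_read_unlock();
	return oz;
}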
+ */ +bool +xfs_zoned_have_reclaimable( + struct xfs_zone_info *zi) +{ + int i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + if (zi->zi_used_bucket_entries[i]) { + spin_unlock(&zi->zi_used_buckets_lock); + return true; + } + } + spin_unlock(&zi->zi_used_buckets_lock); + + return false; +} + static void xfs_open_zone_mark_full( struct xfs_open_zone *oz) @@ -238,6 +264,14 @@ xfs_zoned_map_extent( * If a data write raced with this GC write, keep the existing data in * the data fork, mark our newly written GC extent as reclaimable, then * move on to the next extent. + * + * Note that this can also happen when racing with operations that do + * not actually invalidate the data, but just move it to a different + * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the + * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the + * data was just moved around, GC fails to free the zone, but the zone + * becomes a GC candidate again as soon as all previous GC I/O has + * finished and these blocks will be moved out eventually. */ if (old_startblock != NULLFSBLOCK && old_startblock != data.br_startblock) @@ -599,7 +633,7 @@ xfs_select_open_zone_mru( lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, false)) + if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -614,14 +648,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) } /* - * Try to pack inodes that are written back after they were closed tight instead - * of trying to open new zones for them or spread them to the least recently - * used zone. This optimizes the data layout for workloads that untar or copy - * a lot of small files. Right now this does not separate multiple such + * Try to tightly pack small files that are written back after they were closed + * instead of trying to open new zones for them or spread them to the least + * recently used zone. This optimizes the data layout for workloads that untar + * or copy a lot of small files. Right now this does not separate multiple such * streams. */ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) { + struct xfs_mount *mp = ip->i_mount; + size_t zone_capacity = + XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); + + /* + * Do not pack write files that are already using a full zone to avoid + * fragmentation. + */ + if (i_size_read(VFS_I(ip)) >= zone_capacity) + return false; + return !inode_is_open_for_write(VFS_I(ip)) && !(ip->i_diflags & XFS_DIFLAG_APPEND); } @@ -746,97 +791,54 @@ xfs_mark_rtg_boundary( } /* - * Cache the last zone written to for an inode so that it is considered first - * for subsequent writes. - */ -struct xfs_zone_cache_item { - struct xfs_mru_cache_elem mru; - struct xfs_open_zone *oz; -}; - -static inline struct xfs_zone_cache_item * -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) -{ - return container_of(mru, struct xfs_zone_cache_item, mru); -} - -static void -xfs_zone_cache_free_func( - void *data, - struct xfs_mru_cache_elem *mru) -{ - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); - - xfs_open_zone_put(item->oz); - kfree(item); -} - -/* * Check if we have a cached last open zone available for the inode and * if yes return a reference to it. 
*/ static struct xfs_open_zone * -xfs_cached_zone( - struct xfs_mount *mp, - struct xfs_inode *ip) +xfs_get_cached_zone( + struct xfs_inode *ip) { - struct xfs_mru_cache_elem *mru; - struct xfs_open_zone *oz; + struct xfs_open_zone *oz; - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); - if (!mru) - return NULL; - oz = xfs_zone_cache_item(mru)->oz; + rcu_read_lock(); + oz = VFS_I(ip)->i_private; if (oz) { /* * GC only steals open zones at mount time, so no GC zones * should end up in the cache. */ ASSERT(!oz->oz_is_gc); - ASSERT(atomic_read(&oz->oz_ref) > 0); - atomic_inc(&oz->oz_ref); + if (!atomic_inc_not_zero(&oz->oz_ref)) + oz = NULL; } - xfs_mru_cache_done(mp->m_zone_cache); + rcu_read_unlock(); + return oz; } /* - * Update the last used zone cache for a given inode. + * Stash our zone in the inode so that is is reused for future allocations. * - * The caller must have a reference on the open zone. + * The open_zone structure will be pinned until either the inode is freed or + * until the cached open zone is replaced with a different one because the + * current one was full when we tried to use it. This means we keep any + * open zone around forever as long as any inode that used it for the last + * write is cached, which slightly increases the memory use of cached inodes + * that were every written to, but significantly simplifies the cached zone + * lookup. Because the open_zone is clearly marked as full when all data + * in the underlying RTG was written, the caching is always safe. */ static void -xfs_zone_cache_create_association( - struct xfs_inode *ip, - struct xfs_open_zone *oz) +xfs_set_cached_zone( + struct xfs_inode *ip, + struct xfs_open_zone *oz) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_zone_cache_item *item = NULL; - struct xfs_mru_cache_elem *mru; + struct xfs_open_zone *old_oz; - ASSERT(atomic_read(&oz->oz_ref) > 0); atomic_inc(&oz->oz_ref); - - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); - if (mru) { - /* - * If we have an association already, update it to point to the - * new zone. - */ - item = xfs_zone_cache_item(mru); - xfs_open_zone_put(item->oz); - item->oz = oz; - xfs_mru_cache_done(mp->m_zone_cache); - return; - } - - item = kmalloc(sizeof(*item), GFP_KERNEL); - if (!item) { - xfs_open_zone_put(oz); - return; - } - item->oz = oz; - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); + old_oz = xchg(&VFS_I(ip)->i_private, oz); + if (old_oz) + xfs_open_zone_put(old_oz); } static void @@ -880,15 +882,14 @@ xfs_zone_alloc_and_submit( * the inode is still associated with a zone and use that if so. 
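/*
 * Illustrative sketch (not from the patch): the publish side that pairs
 * with xfs_get_cached_zone() above.  xchg() swaps the new, already
 * referenced zone into inode->i_private atomically, so two racing writers
 * each put exactly the pointer they displaced and no reference is leaked
 * or dropped twice.  The cache reference taken here is released either by
 * a later replacement or by the xfs_fs_evict_inode() hunk earlier in this
 * patch.  The function name is hypothetical.
 */
static void example_publish_cached_zone(struct inode *inode,
		struct xfs_open_zone *oz)
{
	struct xfs_open_zone *old;

	atomic_inc(&oz->oz_ref);	/* reference now owned by the cache */
	old = xchg(&inode->i_private, oz);
	if (old)
		xfs_open_zone_put(old);	/* drop the displaced zone */
}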
*/ if (!*oz) - *oz = xfs_cached_zone(mp, ip); + *oz = xfs_get_cached_zone(ip); if (!*oz) { select_zone: *oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error; - - xfs_zone_cache_create_association(ip, *oz); + xfs_set_cached_zone(ip, *oz); } alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), @@ -966,6 +967,12 @@ xfs_free_open_zones( xfs_open_zone_put(oz); } spin_unlock(&zi->zi_open_zones_lock); + + /* + * Wait for all open zones to be freed so that they drop the group + * references: + */ + rcu_barrier(); } struct xfs_init_zones { @@ -1215,6 +1222,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1245,12 +1253,35 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { - error = blkdev_report_zones(bt->bt_bdev, + error = blkdev_report_zones_cached(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); if (error < 0) @@ -1260,8 +1291,10 @@ xfs_mount_zones( while ((rtg = xfs_rtgroup_next(mp, rtg))) { error = xfs_init_zone(&iz, rtg, NULL); - if (error) + if (error) { + xfs_rtgroup_rele(rtg); goto out_free_zone_info; + } } } @@ -1279,14 +1312,6 @@ xfs_mount_zones( error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; - - /* - * Set up a mru cache to track inode to open zone for data placement - * purposes. The magic values for group count and life time is the - * same as the defaults for file streams, which seems sane enough. 
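/*
 * Worked example (not from the patch): the s_min_writeback_pages value set
 * above, with assumed geometry of 4 KiB filesystem blocks, 4 KiB pages and
 * 256 MiB zones:
 *
 *	zone_blocks           = 256 MiB / 4 KiB       = 65536 blocks
 *	XFS_MAX_BMBT_EXTLEN   = 2^21 - 1              = 2097151 blocks
 *	min(...)              = 65536 blocks          = 256 MiB
 *	s_min_writeback_pages = 256 MiB >> PAGE_SHIFT = 65536 pages
 *
 * Writeback therefore switches away from an inode at zone-size granularity
 * rather than at the default 4 MiB, so files written as a multiple of the
 * zone size are not fragmented across zones by fairness-driven switching.
 */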
- */ - xfs_mru_cache_create(&mp->m_zone_cache, mp, - 5000, 10, xfs_zone_cache_free_func); return 0; out_free_zone_info: @@ -1300,5 +1325,4 @@ xfs_unmount_zones( { xfs_zone_gc_unmount(mp); xfs_free_zone_info(mp->m_zone_info); - xfs_mru_cache_destroy(mp->m_zone_cache); } diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 064cd1a857a0..3c52cc1497d4 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -114,8 +114,9 @@ struct xfs_gc_bio { /* Open Zone being written to */ struct xfs_open_zone *oz; + struct xfs_rtgroup *victim_rtg; + /* Bio used for reads and writes, including the bvec used by it */ - struct bio_vec bv; struct bio bio; /* must be last */ }; @@ -173,14 +174,13 @@ xfs_zoned_need_gc( s64 available, free, threshold; s32 remainder; - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) return false; available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); if (available < - mp->m_groups[XG_TYPE_RTG].blocks * - (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); @@ -264,6 +264,7 @@ xfs_zone_gc_iter_init( iter->rec_count = 0; iter->rec_idx = 0; iter->victim_rtg = victim_rtg; + atomic_inc(&victim_rtg->rtg_gccount); } /* @@ -362,6 +363,7 @@ xfs_zone_gc_query( return 0; done: + atomic_dec(&iter->victim_rtg->rtg_gccount); xfs_rtgroup_rele(iter->victim_rtg); iter->victim_rtg = NULL; return 0; @@ -451,6 +453,20 @@ xfs_zone_gc_pick_victim_from( if (!rtg) continue; + /* + * If the zone is already undergoing GC, don't pick it again. + * + * This prevents us from picking one of the zones for which we + * already submitted GC I/O, but for which the remapping hasn't + * concluded yet. This won't cause data corruption, but + * increases write amplification and slows down GC, so this is + * a bad thing. + */ + if (atomic_read(&rtg->rtg_gccount)) { + xfs_rtgroup_rele(rtg); + continue; + } + /* skip zones that are just waiting for a reset */ if (rtg_rmap(rtg)->i_used_blocks == 0 || rtg_rmap(rtg)->i_used_blocks >= victim_used) { @@ -491,21 +507,6 @@ xfs_zone_gc_select_victim( struct xfs_rtgroup *victim_rtg = NULL; unsigned int bucket; - if (xfs_is_shutdown(mp)) - return false; - - if (iter->victim_rtg) - return true; - - /* - * Don't start new work if we are asked to stop or park. 
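/*
 * Note on the rtg_gccount guard above (not from the patch): the counter is
 * raised once by the GC iterator for the current victim and once per
 * in-flight GC chunk (including split writes, see the hunks below), and
 * dropped as each of those finishes, so a non-zero value means GC I/O for
 * this group has been submitted but its remapping has not completed.
 * Victim selection skips such groups rather than relocating the same
 * blocks twice.  A trivial helper to make the check self-describing
 * (hypothetical name):
 */
static inline bool example_rtg_gc_in_progress(const struct xfs_rtgroup *rtg)
{
	return atomic_read(&rtg->rtg_gccount) != 0;
}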
- */ - if (kthread_should_stop() || kthread_should_park()) - return false; - - if (!xfs_zoned_need_gc(mp)) - return false; - spin_lock(&zi->zi_used_buckets_lock); for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); @@ -703,6 +704,9 @@ xfs_zone_gc_start_chunk( chunk->scratch = &data->scratch[data->scratch_idx]; chunk->data = data; chunk->oz = oz; + chunk->victim_rtg = iter->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; @@ -725,6 +729,8 @@ static void xfs_zone_gc_free_chunk( struct xfs_gc_bio *chunk) { + atomic_dec(&chunk->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(chunk->victim_rtg); list_del(&chunk->entry); xfs_open_zone_put(chunk->oz); xfs_irele(chunk->ip); @@ -785,6 +791,10 @@ xfs_zone_gc_split_write( split_chunk->oz = chunk->oz; atomic_inc(&chunk->oz->oz_ref); + split_chunk->victim_rtg = chunk->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + chunk->offset += split_len; chunk->len -= split_len; chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); @@ -975,6 +985,27 @@ xfs_zone_gc_reset_zones( } while (next); } +static bool +xfs_zone_gc_should_start_new_work( + struct xfs_zone_gc_data *data) +{ + if (xfs_is_shutdown(data->mp)) + return false; + if (!xfs_zone_gc_space_available(data)) + return false; + + if (!data->iter.victim_rtg) { + if (kthread_should_stop() || kthread_should_park()) + return false; + if (!xfs_zoned_need_gc(data->mp)) + return false; + if (!xfs_zone_gc_select_victim(data)) + return false; + } + + return true; +} + /* * Handle the work to read and write data for GC and to reset the zones, * including handling all completions. @@ -982,7 +1013,7 @@ xfs_zone_gc_reset_zones( * Note that the order of the chunks is preserved so that we don't undo the * optimal order established by xfs_zone_gc_query(). 
*/ -static bool +static void xfs_zone_gc_handle_work( struct xfs_zone_gc_data *data) { @@ -996,30 +1027,22 @@ xfs_zone_gc_handle_work( zi->zi_reset_list = NULL; spin_unlock(&zi->zi_reset_list_lock); - if (!xfs_zone_gc_select_victim(data) || - !xfs_zone_gc_space_available(data)) { - if (list_empty(&data->reading) && - list_empty(&data->writing) && - list_empty(&data->resetting) && - !reset_list) - return false; - } - - __set_current_state(TASK_RUNNING); - try_to_freeze(); - - if (reset_list) + if (reset_list) { + set_current_state(TASK_RUNNING); xfs_zone_gc_reset_zones(data, reset_list); + } list_for_each_entry_safe(chunk, next, &data->resetting, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_reset(chunk); } list_for_each_entry_safe(chunk, next, &data->writing, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_chunk(chunk); } @@ -1027,15 +1050,18 @@ xfs_zone_gc_handle_work( list_for_each_entry_safe(chunk, next, &data->reading, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_write_chunk(chunk); } blk_finish_plug(&plug); - blk_start_plug(&plug); - while (xfs_zone_gc_start_chunk(data)) - ; - blk_finish_plug(&plug); - return true; + if (xfs_zone_gc_should_start_new_work(data)) { + set_current_state(TASK_RUNNING); + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + } } /* @@ -1059,8 +1085,18 @@ xfs_zoned_gcd( for (;;) { set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); xfs_set_zonegc_running(mp); - if (xfs_zone_gc_handle_work(data)) + + xfs_zone_gc_handle_work(data); + + /* + * Only sleep if nothing set the state to running. Else check for + * work again as someone might have queued up more work and woken + * us in the meantime. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); continue; + } if (list_empty(&data->reading) && list_empty(&data->writing) && @@ -1146,16 +1182,16 @@ xfs_zone_gc_mount( goto out_put_gc_zone; } - mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, "xfs-zone-gc/%s", mp->m_super->s_id); - if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + if (IS_ERR(zi->zi_gc_thread)) { xfs_warn(mp, "unable to create zone gc thread"); - error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + error = PTR_ERR(zi->zi_gc_thread); goto out_free_gc_data; } /* xfs_zone_gc_start will unpark for rw mounts */ - kthread_park(mp->m_zone_info->zi_gc_thread); + kthread_park(zi->zi_gc_thread); return 0; out_free_gc_data: diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 35e6de3d25ed..ce7f0e2f4598 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -44,6 +44,8 @@ struct xfs_open_zone { * the life time of an open zone. 
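/*
 * Illustrative sketch (not from the patch): the sleep/wake discipline the
 * reworked xfs_zoned_gcd() loop above relies on.  The task state is set to
 * TASK_INTERRUPTIBLE before looking for work; finding work (or a concurrent
 * wake_up_process()) moves it back to TASK_RUNNING, and the thread only
 * schedules out if the state is still not running afterwards, so a wakeup
 * that arrives while work is being processed is never lost.
 * example_have_work()/example_do_work() are placeholders.
 */
static int example_worker_thread(void *arg)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		if (example_have_work(arg)) {
			__set_current_state(TASK_RUNNING);
			example_do_work(arg);
			try_to_freeze();
			continue;
		}

		schedule();
	}
	return 0;
}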
*/ struct xfs_rtgroup *oz_rtg; + + struct rcu_head oz_rcu; }; /* @@ -111,6 +113,7 @@ struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); bool xfs_zoned_need_gc(struct xfs_mount *mp); +bool xfs_zoned_have_reclaimable(struct xfs_zone_info *zi); int xfs_zone_gc_mount(struct xfs_mount *mp); void xfs_zone_gc_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 9cd38716fd25..fc1a4d1ce10c 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -54,12 +54,10 @@ xfs_zoned_default_resblks( { switch (ctr) { case XC_FREE_RTEXTENTS: - return (uint64_t)XFS_RESERVED_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks + - mp->m_sb.sb_rtreserved; + return xfs_rtgs_to_rfsbs(mp, XFS_RESERVED_ZONES) + + mp->m_sb.sb_rtreserved; case XC_FREE_RTAVAILABLE: - return (uint64_t)XFS_GC_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks; + return xfs_rtgs_to_rfsbs(mp, XFS_GC_ZONES); default: ASSERT(0); return 0; @@ -174,7 +172,7 @@ xfs_zoned_reserve_available( * processing a pending GC request give up as we're fully out * of space. */ - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + if (!xfs_zoned_have_reclaimable(mp->m_zone_info) && !xfs_is_zonegc_running(mp)) break; |
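/*
 * Note (not from the patch): the zone-count to block-count conversions in
 * xfs_zoned_default_resblks() above go through a helper because both
 * factors are 32-bit (blocks per zone and number of zones) and the product
 * must be widened before the multiply; e.g. 65536 blocks per zone * 70000
 * zones = 4,587,520,000, which no longer fits in a u32.  A stand-alone
 * illustration with a hypothetical helper name:
 */
static inline uint64_t example_zones_to_blocks(uint32_t blocks_per_zone,
		uint32_t nr_zones)
{
	return (uint64_t)blocks_per_zone * nr_zones;	/* widen first */
}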
