From ff694ab60c29cfeba81b3d5068d3c908f22110ed Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:55 -0700 Subject: fs/ext4: Narrow scope of DAX check in setflags When preventing DAX and journaling on an inode. Use the effective DAX check rather than the mount option. This will be required to support per inode DAX flags. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-2-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..5813e5e73eab 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -393,9 +393,9 @@ flags_err: if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to - * S_DAX if we are using the DAX mount option. + * S_DAX if the inode is DAX */ - if (test_opt(inode->i_sb, DAX)) { + if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } -- cgit From 6c0d077ff8200a7b8e7131ca8e25315dec243a60 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:56 -0700 Subject: fs/ext4: Disallow verity if inode is DAX Verity and DAX are incompatible. Changing the DAX mode due to a verity flag change is wrong without a corresponding address_space_operations update. Make the 2 options mutually exclusive by returning an error if DAX was set first. (Setting DAX is already disabled if Verity is set first.) Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-3-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..f05a09fb2ae4 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; + if (IS_DAX(inode)) + return -EINVAL; + if (ext4_verity_in_progress(inode)) return -EBUSY; -- cgit From fc626fe322f13ecdfac58cbfa6cfea797ed22623 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:57 -0700 Subject: fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS In prep for the new tri-state mount option which then introduces EXT4_MOUNT_DAX_NEVER. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-4-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..1a3daf2d18ef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1123,9 +1123,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX -#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else -#define EXT4_MOUNT_DAX 0 +#define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..a10ff12194db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4400,7 +4400,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) static bool ext4_should_use_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX)) + if (!test_opt(inode->i_sb, DAX_ALWAYS)) return false; if (!S_ISREG(inode->i_mode)) return false; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..7b99c44d0a91 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1775,7 +1775,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -3982,7 +3982,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4092,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); @@ -5412,7 +5412,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); err = -EINVAL; @@ -5433,10 +5433,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX; + sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From a8ab6d3885ef5e2300d683b79a9e1999403eefd9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:58 -0700 Subject: fs/ext4: Update ext4_should_use_dax() S_DAX should only be enabled when the underlying block device supports dax. Cache the underlying support for DAX in the super block and modify ext4_should_use_dax() to check for device support prior to the over riding mount option. While we are at it change the function to ext4_should_enable_dax() as this better reflects the ask as well as matches xfs. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-5-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 15 ++++++++++----- fs/ext4/super.c | 5 ++++- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1a3daf2d18ef..0b4db9ce7756 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1979,6 +1979,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) */ #define EXT4_FLAGS_RESIZING 0 #define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a10ff12194db..6532870f6a0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4398,10 +4398,10 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } -static bool ext4_should_use_dax(struct inode *inode) +static bool ext4_should_enable_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX_ALWAYS)) - return false; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) @@ -4412,7 +4412,12 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; - return true; + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) + return false; + if (test_opt(inode->i_sb, DAX_ALWAYS)) + return true; + + return false; } void ext4_set_inode_flags(struct inode *inode) @@ -4430,7 +4435,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_use_dax(inode)) + if (ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b99c44d0a91..f7d76dcaedfe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4092,13 +4092,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (bdev_dax_supported(sb->s_bdev, blocksize)) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); goto failed_mount; } - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); goto failed_mount; -- cgit From 043546e46dc70c25ff7e2cf6d09cbb0424fc9978 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 07:59:59 -0700 Subject: fs/ext4: Only change S_DAX on inode load To prevent complications with in memory inodes we only set S_DAX on inode load. FS_XFLAG_DAX can be changed at any time and S_DAX will change after inode eviction and reload. Add init bool to ext4_set_inode_flags() to indicate if the inode is being newly initialized. Assert that S_DAX is not set on an inode which is just being loaded. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-6-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 13 ++++++++++--- fs/ext4/ioctl.c | 3 ++- fs/ext4/super.c | 4 ++-- fs/ext4/verity.c | 2 +- 6 files changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0b4db9ce7756..f5291693ce6e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2693,7 +2693,7 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern void ext4_set_inode_flags(struct inode *); +extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..7941c140723f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1116,7 +1116,7 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6532870f6a0b..01636cf5f322 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4420,11 +4420,13 @@ static bool ext4_should_enable_dax(struct inode *inode) return false; } -void ext4_set_inode_flags(struct inode *inode) +void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; + WARN_ON_ONCE(IS_DAX(inode) && init); + if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) @@ -4435,8 +4437,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_enable_dax(inode)) + + /* Because of the way inode_set_flags() works we must preserve S_DAX + * here if already set. */ + new_fl |= (inode->i_flags & S_DAX); + if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) @@ -4650,7 +4657,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5813e5e73eab..145083e8cd1e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -381,7 +381,8 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_clear_inode_flag(inode, i); } - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7d76dcaedfe..80eb814c47eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1348,7 +1348,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); } return res; } @@ -1375,7 +1375,7 @@ retry: * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index f05a09fb2ae4..89a155ece323 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -244,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, if (err) goto out_stop; ext4_set_inode_flag(inode, EXT4_INODE_VERITY); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); err = ext4_mark_iloc_dirty(handle, inode, &iloc); } out_stop: -- cgit From 9cb20f94afcd2964944f9468e38da736ee855b19 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:00 -0700 Subject: fs/ext4: Make DAX mount option a tri-state We add 'always', 'never', and 'inode' (default). '-o dax' continues to operate the same which is equivalent to 'always'. This new functionality is limited to ext4 only. Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode. We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX. Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user specified that option for printing. Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-7-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 2 ++ fs/ext4/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5291693ce6e..65ffb831b2b9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1168,6 +1168,8 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01636cf5f322..68fac9289109 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4402,6 +4402,8 @@ static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + if (test_opt2(inode->i_sb, DAX_NEVER)) + return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80eb814c47eb..5e056aa20ce9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1512,7 +1512,8 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1579,6 +1580,9 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1726,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 +#define MOPT_SKIP 0x0800 static const struct mount_opts { int token; @@ -1775,7 +1780,13 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -2084,13 +2095,32 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef CONFIG_FS_DAX - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + switch (token) { + case Opt_dax: + case Opt_dax_always: + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + break; + case Opt_dax_never: + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + break; + case Opt_dax_inode: + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + /* Strictly for printing options */ + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + break; + } #else ext4_msg(sb, KERN_INFO, "dax option not supported"); + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; return -1; #endif } else if (token == Opt_data_err_abort) { @@ -2254,7 +2284,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) + (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2314,6 +2344,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (DUMMY_ENCRYPTION_ENABLED(sbi)) SEQ_OPTS_PUTS("test_dummy_encryption"); + if (test_opt(sb, DAX_ALWAYS)) { + if (IS_EXT2_SB(sb)) + SEQ_OPTS_PUTS("dax"); + else + SEQ_OPTS_PUTS("dax=always"); + } else if (test_opt2(sb, DAX_NEVER)) { + SEQ_OPTS_PUTS("dax=never"); + } else if (test_opt2(sb, DAX_INODE)) { + SEQ_OPTS_PUTS("dax=inode"); + } + ext4_show_quota_options(seq, sb); return 0; } @@ -5436,10 +5477,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER || + (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) { ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS; + "dax mount option with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); + sbi->s_mount_opt2 |= old_opts.s_mount_opt2 & + (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- cgit From fcebc7949cd2ff97407e5b77ed99a7211674c6de Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:01 -0700 Subject: fs/ext4: Remove jflag variable The jflag variable serves almost no purpose. Remove it. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-8-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 145083e8cd1e..779631e8e849 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -300,7 +300,6 @@ static int ext4_ioctl_setflags(struct inode *inode, int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; - unsigned int jflag; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ @@ -309,9 +308,6 @@ static int ext4_ioctl_setflags(struct inode *inode, oldflags = ei->i_flags; - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); if (err) goto flags_out; @@ -320,7 +316,7 @@ static int ext4_ioctl_setflags(struct inode *inode, * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } @@ -391,7 +387,7 @@ flags_err: if (err) goto flags_out; - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if the inode is DAX @@ -401,7 +397,8 @@ flags_err: goto flags_out; } - err = ext4_change_inode_journal_flag(inode, jflag); + err = ext4_change_inode_journal_flag(inode, + flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } -- cgit From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:02 -0700 Subject: fs/ext4: Introduce DAX inode flag Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time except if it conflicts with the set of mutually exclusive flags (Currently VERITY, ENCRYPT, JOURNAL_DATA). Furthermore, restrict setting any of the exclusive flags if DAX is set. While conceptually possible, we do not allow setting EXT4_DAX_FL while at the same time clearing exclusion flags (or vice versa) for 2 reasons: 1) The DAX flag does not take effect immediately which introduces quite a bit of complexity 2) There is no clear use case for being this flexible Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 18 ++++++++++++++---- fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/super.c | 3 +++ fs/ext4/verity.c | 2 +- 5 files changed, 65 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ffb831b2b9..598e00a9453f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -415,13 +415,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -429,14 +432,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL) + EXT4_PROJINHERIT_FL | \ + EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -448,6 +453,10 @@ struct flex_groups { /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -488,6 +497,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 68fac9289109..778b0dbe3da6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4419,7 +4419,7 @@ static bool ext4_should_enable_dax(struct inode *inode) if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; - return false; + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 779631e8e849..1b520d07d371 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, return 0; } +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { @@ -320,6 +352,12 @@ static int ext4_ioctl_setflags(struct inode *inode, if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; @@ -365,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode, if (err) goto flags_err; + ext4_dax_dontcache(inode, flags); + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; @@ -525,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) xflags |= FS_XFLAG_NOATIME; if (iflags & EXT4_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; + if (iflags & EXT4_DAX_FL) + xflags |= FS_XFLAG_DAX; return xflags; } #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \ + FS_XFLAG_DAX) /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) @@ -549,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) iflags |= EXT4_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) iflags |= EXT4_PROJINHERIT_FL; + if (xflags & FS_XFLAG_DAX) + iflags |= EXT4_DAX_FL; return iflags; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e056aa20ce9..3658e3016999 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1323,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 89a155ece323..4fecb3e4e338 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,7 +113,7 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; - if (IS_DAX(inode)) + if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EINVAL; if (ext4_verity_in_progress(inode)) -- cgit From 829b37b8cddb1db75c1b7905505b90e593b15db1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 10 Jun 2020 11:16:37 -0400 Subject: ext4: avoid race conditions when remounting with options that change dax Trying to change dax mount options when remounting could allow mount options to be enabled for a small amount of time, and then the mount option change would be reverted. In the case of "mount -o remount,dax", this can cause a race where files would temporarily treated as DAX --- and then not. Cc: stable@kernel.org Reported-by: syzbot+bca9799bf129256190da@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a22d67c5bc00..edf06c1bee9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2104,16 +2104,40 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, switch (token) { case Opt_dax: case Opt_dax_always: + if (is_remount && + (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { + fail_dax_change_remount: + ext4_msg(sb, KERN_ERR, "can't change " + "dax mount option while remounting"); + return -1; + } + if (is_remount && + (test_opt(sb, DATA_FLAGS) == + EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -1; + } ext4_msg(sb, KERN_WARNING, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; break; case Opt_dax_never: + if (is_remount && + (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) + goto fail_dax_change_remount; sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; break; case Opt_dax_inode: + if (is_remount && + ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) + goto fail_dax_change_remount; sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; /* Strictly for printing options */ @@ -5454,12 +5478,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX_ALWAYS)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - err = -EINVAL; - goto restore_opts; - } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " @@ -5475,18 +5493,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS || - (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER || - (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) { - ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax mount option with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS; - sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); - sbi->s_mount_opt2 |= old_opts.s_mount_opt2 & - (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE); - } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); -- cgit From cfb3c85a600c6aa25a2581b3c1c4db3460f14e46 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Fri, 22 May 2020 12:18:44 +0800 Subject: ext4: fix partial cluster initialization when splitting extent Fix the bug when calculating the physical block number of the first block in the split extent. This bug will cause xfstests shared/298 failure on ext4 with bigalloc enabled occasionally. Ext4 error messages indicate that previously freed blocks are being freed again, and the following fsck will fail due to the inconsistency of block bitmap and bg descriptor. The following is an example case: 1. First, Initialize a ext4 filesystem with cluster size '16K', block size '4K', in which case, one cluster contains four blocks. 2. Create one file (e.g., xxx.img) on this ext4 filesystem. Now the extent tree of this file is like: ... 36864:[0]4:220160 36868:[0]14332:145408 51200:[0]2:231424 ... 3. Then execute PUNCH_HOLE fallocate on this file. The hole range is like: .. ext4_ext_remove_space: dev 254,16 ino 12 since 49506 end 49506 depth 1 ext4_ext_remove_space: dev 254,16 ino 12 since 49544 end 49546 depth 1 ext4_ext_remove_space: dev 254,16 ino 12 since 49605 end 49607 depth 1 ... 4. Then the extent tree of this file after punching is like ... 49507:[0]37:158047 49547:[0]58:158087 ... 5. Detailed procedure of punching hole [49544, 49546] 5.1. The block address space: ``` lblk ~49505 49506 49507~49543 49544~49546 49547~ ---------+------+-------------+----------------+-------- extent | hole | extent | hole | extent ---------+------+-------------+----------------+-------- pblk ~158045 158046 158047~158083 158084~158086 158087~ ``` 5.2. The detailed layout of cluster 39521: ``` cluster 39521 <-------------------------------> hole extent <----------------------><-------- lblk 49544 49545 49546 49547 +-------+-------+-------+-------+ | | | | | +-------+-------+-------+-------+ pblk 158084 1580845 158086 158087 ``` 5.3. The ftrace output when punching hole [49544, 49546]: - ext4_ext_remove_space (start 49544, end 49546) - ext4_ext_rm_leaf (start 49544, end 49546, last_extent [49507(158047), 40], partial [pclu 39522 lblk 0 state 2]) - ext4_remove_blocks (extent [49507(158047), 40], from 49544 to 49546, partial [pclu 39522 lblk 0 state 2] - ext4_free_blocks: (block 158084 count 4) - ext4_mballoc_free (extent 1/6753/1) 5.4. Ext4 error message in dmesg: EXT4-fs error (device vdb): mb_free_blocks:1457: group 1, block 158084:freeing already freed block (bit 6753); block bitmap corrupt. EXT4-fs error (device vdb): ext4_mb_generate_buddy:747: group 1, block bitmap and bg descriptor inconsistent: 19550 vs 19551 free clusters In this case, the whole cluster 39521 is freed mistakenly when freeing pblock 158084~158086 (i.e., the first three blocks of this cluster), although pblock 158087 (the last remaining block of this cluster) has not been freed yet. The root cause of this isuue is that, the pclu of the partial cluster is calculated mistakenly in ext4_ext_remove_space(). The correct partial_cluster.pclu (i.e., the cluster number of the first block in the next extent, that is, lblock 49597 (pblock 158086)) should be 39521 rather than 39522. Fixes: f4226d9ea400 ("ext4: fix partial cluster initialization") Signed-off-by: Jeffle Xu Reviewed-by: Eric Whitney Cc: stable@kernel.org # v3.19+ Link: https://lore.kernel.org/r/1590121124-37096-1-git-send-email-jefflexu@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7d088ff1e902..221f240eae60 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2844,7 +2844,7 @@ again: * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { - pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } -- cgit From 5adaccac46ea79008d7b75f47913f1a00f91d0ce Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 1 Jun 2020 15:34:04 +0800 Subject: ext4: stop overwrite the errcode in ext4_setup_super Now the errcode from ext4_commit_super will overwrite EROFS exists in ext4_setup_super. Actually, no need to call ext4_commit_super since we will return EROFS. Fix it by goto done directly. Fixes: c89128a00838 ("ext4: handle errors on ext4_commit_super") Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200601073404.3712492-1-yangerkun@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index edf06c1bee9d..127b8efa8d54 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2412,6 +2412,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; + goto done; } if (read_only) goto done; -- cgit From 2ce3ee931a097e9720310db3f09c01c825a4580c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 1 Jun 2020 13:05:43 -0700 Subject: ext4: avoid utf8_strncasecmp() with unstable name If the dentry name passed to ->d_compare() fits in dentry::d_iname, then it may be concurrently modified by a rename. This can cause undefined behavior (possibly out-of-bounds memory accesses or crashes) in utf8_strncasecmp(), since fs/unicode/ isn't written to handle strings that may be concurrently modified. Fix this by first copying the filename to a stack buffer if needed. This way we get a stable snapshot of the filename. Fixes: b886ee3e778e ("ext4: Support case-insensitive file name lookups") Cc: # v5.2+ Cc: Al Viro Cc: Daniel Rosenberg Cc: Gabriel Krisman Bertazi Signed-off-by: Eric Biggers Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20200601200543.59417-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c654205f648d..1d82336b1cd4 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, struct qstr qstr = {.name = str, .len = len }; const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *inode = READ_ONCE(parent->d_inode); + char strbuf[DNAME_INLINE_LEN]; if (!inode || !IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { @@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, return memcmp(str, name->name, len); } + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. + */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + qstr.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + return ext4_ci_compare(inode, name, &qstr, false); } -- cgit From 811985365378df01386c3cfb7ff716e74ca376d5 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 9 Jun 2020 16:23:10 +0530 Subject: ext4: mballoc: Use this_cpu_read instead of this_cpu_ptr Simplify reading a seq variable by directly using this_cpu_read API instead of doing this_cpu_ptr and then dereferencing it. This also avoid the below kernel BUG: which happens when CONFIG_DEBUG_PREEMPT is enabled BUG: using smp_processor_id() in preemptible [00000000] code: syz-fuzzer/6927 caller is ext4_mb_new_blocks+0xa4d/0x3b70 fs/ext4/mballoc.c:4711 CPU: 1 PID: 6927 Comm: syz-fuzzer Not tainted 5.7.0-next-20200602-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 check_preemption_disabled+0x20d/0x220 lib/smp_processor_id.c:48 ext4_mb_new_blocks+0xa4d/0x3b70 fs/ext4/mballoc.c:4711 ext4_ext_map_blocks+0x201b/0x33e0 fs/ext4/extents.c:4244 ext4_map_blocks+0x4cb/0x1640 fs/ext4/inode.c:626 ext4_getblk+0xad/0x520 fs/ext4/inode.c:833 ext4_bread+0x7c/0x380 fs/ext4/inode.c:883 ext4_append+0x153/0x360 fs/ext4/namei.c:67 ext4_init_new_dir fs/ext4/namei.c:2757 [inline] ext4_mkdir+0x5e0/0xdf0 fs/ext4/namei.c:2802 vfs_mkdir+0x419/0x690 fs/namei.c:3632 do_mkdirat+0x21e/0x280 fs/namei.c:3655 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:359 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 42f56b7a4a7d ("ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC handling") Suggested-by: Borislav Petkov Tested-by: Marek Szyprowski Signed-off-by: Ritesh Harjani Reviewed-by: Christoph Hellwig Reported-by: syzbot+82f324bb69744c5f6969@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/534f275016296996f54ecf65168bb3392b6f653d.1591699601.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a9083113a8c0..c0a331e2feb0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4708,7 +4708,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; - seq = *this_cpu_ptr(&discard_pa_seq); + seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); -- cgit From 88ee9d571b6d8ed345f877e05f685814412e359b Mon Sep 17 00:00:00 2001 From: "Jan (janneke) Nieuwenhuizen" Date: Mon, 25 May 2020 21:39:40 +0200 Subject: ext4: support xattr gnu.* namespace for the Hurd The Hurd gained[0] support for moving the translator and author fields out of the inode and into the "gnu.*" xattr namespace. In anticipation of that, an xattr INDEX was reserved[1]. The Hurd has now been brought into compliance[2] with that. This patch adds support for reading and writing such attributes from Linux; you can now do something like mkdir -p hurd-root/servers/socket touch hurd-root/servers/socket/1 setfattr --name=gnu.translator --value='"/hurd/pflocal\0"' \ hurd-root/servers/socket/1 getfattr --name=gnu.translator hurd-root/servers/socket/1 # file: 1 gnu.translator="/hurd/pflocal" to setup a pipe translator, which is being used to create[3] a vm-image for the Hurd from GNU Guix. [0] https://summerofcode.withgoogle.com/projects/#5869799859027968 [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3980bd3b406addb327d858aebd19e229ea340b9a [2] https://git.savannah.gnu.org/cgit/hurd/hurd.git/commit/?id=a04c7bf83172faa7cb080fbe3b6c04a8415ca645 [3] https://git.savannah.gnu.org/cgit/guix.git/log/?h=wip-hurd-vm Signed-off-by: Jan Nieuwenhuizen Link: https://lore.kernel.org/r/20200525193940.878-1-janneke@gnu.org Signed-off-by: Theodore Ts'o --- fs/ext4/Makefile | 3 ++- fs/ext4/xattr.c | 2 ++ fs/ext4/xattr.h | 1 + fs/ext4/xattr_hurd.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/xattr_hurd.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 4ccb3c9189d8..2e42f47a7f98 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ - super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ + xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9b29a40738ac..7d2f6576d954 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif + [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler *ext4_xattr_handlers[] = { @@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif + &ext4_xattr_hurd_handler, NULL }; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ffe21ac77f78..730b91fa0dd7 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -124,6 +124,7 @@ struct ext4_xattr_inode_array { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_hurd_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c new file mode 100644 index 000000000000..8cfa74a56361 --- /dev/null +++ b/fs/ext4/xattr_hurd.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_hurd.c + * Handler for extended gnu attributes for the Hurd. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, + */ + +#include +#include +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_hurd_list(struct dentry *dentry) +{ + return test_opt(dentry->d_sb, XATTR_USER); +} + +static int +ext4_xattr_hurd_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD, + name, buffer, size); +} + +static int +ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_hurd_handler = { + .prefix = XATTR_HURD_PREFIX, + .list = ext4_xattr_hurd_list, + .get = ext4_xattr_hurd_get, + .set = ext4_xattr_hurd_set, +}; -- cgit From 7b97d868b7ab2448859668de9222b8af43f76e78 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Tue, 9 Jun 2020 15:35:40 +0800 Subject: ext4, jbd2: ensure panic by fix a race between jbd2 abort and ext4 error handlers In the ext4 filesystem with errors=panic, if one process is recording errno in the superblock when invoking jbd2_journal_abort() due to some error cases, it could be raced by another __ext4_abort() which is setting the SB_RDONLY flag but missing panic because errno has not been recorded. jbd2_journal_commit_transaction() jbd2_journal_abort() journal->j_flags |= JBD2_ABORT; jbd2_journal_update_sb_errno() | ext4_journal_check_start() | __ext4_abort() | sb->s_flags |= SB_RDONLY; | if (!JBD2_REC_ERR) | return; journal->j_flags |= JBD2_REC_ERR; Finally, it will no longer trigger panic because the filesystem has already been set read-only. Fix this by introduce j_abort_mutex to make sure journal abort is completed before panic, and remove JBD2_REC_ERR flag. Fixes: 4327ba52afd03 ("ext4, jbd2: ensure entering into panic after recording an error in superblock") Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200609073540.3810702-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 16 +++++----------- fs/jbd2/journal.c | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 127b8efa8d54..2660a7e47eef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb) smp_wmb(); sb->s_flags |= SB_RDONLY; } else if (test_opt(sb, ERRORS_PANIC)) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } @@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible * before ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) panic("EXT4-fs panic from previous error\n"); - } } void __ext4_msg(struct super_block *sb, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a49d0e670ddf..e4944436e733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1140,6 +1140,7 @@ static journal_t *journal_init_common(struct block_device *bdev, init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); init_waitqueue_head(&journal->j_wait_reserved); + mutex_init(&journal->j_abort_mutex); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); @@ -1402,7 +1403,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); - jbd2_journal_abort(journal, ret); + if (!is_journal_aborted(journal)) + jbd2_journal_abort(journal, ret); } return ret; @@ -2153,6 +2155,13 @@ void jbd2_journal_abort(journal_t *journal, int errno) { transaction_t *transaction; + /* + * Lock the aborting procedure until everything is done, this avoid + * races between filesystem's error handling flow (e.g. ext4_abort()), + * ensure panic after the error info is written into journal's + * superblock. + */ + mutex_lock(&journal->j_abort_mutex); /* * ESHUTDOWN always takes precedence because a file system check * caused by any other journal abort error is not required after @@ -2167,6 +2176,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) journal->j_errno = errno; jbd2_journal_update_sb_errno(journal); } + mutex_unlock(&journal->j_abort_mutex); return; } @@ -2188,10 +2198,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) * layer could realise that a filesystem check is needed. */ jbd2_journal_update_sb_errno(journal); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); + mutex_unlock(&journal->j_abort_mutex); } /** -- cgit From 59960b9deb5354e4cdb0b6ed3a3b653a2b4eb602 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 16:36:30 +0300 Subject: io_uring: fix lazy work init Don't leave garbage in req.work before punting async on -EAGAIN in io_iopoll_queue(). [ 140.922099] general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI ... [ 140.922105] RIP: 0010:io_worker_handle_work+0x1db/0x480 ... [ 140.922114] Call Trace: [ 140.922118] ? __next_timer_interrupt+0xe0/0xe0 [ 140.922119] io_wqe_worker+0x2a9/0x360 [ 140.922121] ? _raw_spin_unlock_irqrestore+0x24/0x40 [ 140.922124] kthread+0x12c/0x170 [ 140.922125] ? io_worker_handle_work+0x480/0x480 [ 140.922126] ? kthread_park+0x90/0x90 [ 140.922127] ret_from_fork+0x22/0x30 Fixes: 7cdaf587de7c ("io_uring: avoid whole io_wq_work copy for requests completed inline") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 155f3d830ddb..c04b20bf2803 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1087,6 +1087,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } + io_req_init_async(req); io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); -- cgit From bb413489288e4e457353bac513fddb6330d245ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Jun 2020 00:15:13 +0100 Subject: afs: Fix non-setting of mtime when writing into mmap The mtime on an inode needs to be updated when a write is made into an mmap'ed section. There are three ways in which this could be done: update it when page_mkwrite is called, update it when a page is changed from dirty to writeback or leave it to the server and fix the mtime up from the reply to the StoreData RPC. Found with the generic/215 xfstest. Fixes: 1cf7a1518aef ("afs: Implement shared-writeable mmap") Signed-off-by: David Howells --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 768497f82aee..9270bb01be67 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -844,6 +844,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) vmf->page->index, priv); SetPagePrivate(vmf->page); set_page_private(vmf->page, priv); + file_update_time(file); sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; -- cgit From 1f32ef79897052ef7d3d154610d8d6af95abde83 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Jun 2020 23:58:51 +0100 Subject: afs: afs_write_end() should change i_size under the right lock Fix afs_write_end() to change i_size under vnode->cb_lock rather than ->wb_lock so that it doesn't race with afs_vnode_commit_status() and afs_getattr(). The ->wb_lock is only meant to guard access to ->wb_keys which isn't accessed by that piece of code. Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 9270bb01be67..a55cb73e0449 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -194,11 +194,11 @@ int afs_write_end(struct file *file, struct address_space *mapping, i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { - spin_lock(&vnode->wb_lock); + write_seqlock(&vnode->cb_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) i_size_write(&vnode->vfs_inode, maybe_i_size); - spin_unlock(&vnode->wb_lock); + write_sequnlock(&vnode->cb_lock); } if (!PageUptodate(page)) { -- cgit From 3f4aa981816368fe6b1d13c2bfbe76df9687e787 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 13 Jun 2020 00:03:48 +0100 Subject: afs: Fix EOF corruption When doing a partial writeback, afs_write_back_from_locked_page() may generate an FS.StoreData RPC request that writes out part of a file when a file has been constructed from pieces by doing seek, write, seek, write, ... as is done by ld. The FS.StoreData RPC is given the current i_size as the file length, but the server basically ignores it unless the data length is 0 (in which case it's just a truncate operation). The revised file length returned in the result of the RPC may then not reflect what we suggested - and this leads to i_size getting moved backwards - which causes issues later. Fix the client to take account of this by ignoring the returned file size unless the data version number jumped unexpectedly - in which case we're going to have to clear the pagecache and reload anyway. This can be observed when doing a kernel build on an AFS mount. The following pair of commands produce the issue: ld -m elf_x86_64 -z max-page-size=0x200000 --emit-relocs \ -T arch/x86/realmode/rm/realmode.lds \ arch/x86/realmode/rm/header.o \ arch/x86/realmode/rm/trampoline_64.o \ arch/x86/realmode/rm/stack.o \ arch/x86/realmode/rm/reboot.o \ -o arch/x86/realmode/rm/realmode.elf arch/x86/tools/relocs --realmode \ arch/x86/realmode/rm/realmode.elf \ >arch/x86/realmode/rm/realmode.relocs This results in the latter giving: Cannot read ELF section headers 0/18: Success as the realmode.elf file got corrupted. The sequence of events can also be driven with: xfs_io -t -f \ -c "pwrite -S 0x58 0 0x58" \ -c "pwrite -S 0x59 10000 1000" \ -c "close" \ /afs/example.com/scratch/a Fixes: 31143d5d515e ("AFS: implement basic file write support") Signed-off-by: David Howells --- fs/afs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index cd0a0060950b..8d10bfb392d1 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -168,6 +168,7 @@ static void afs_apply_status(struct afs_operation *op, struct timespec64 t; umode_t mode; bool data_changed = false; + bool change_size = false; _enter("{%llx:%llu.%u} %s", vp->fid.vid, vp->fid.vnode, vp->fid.unique, @@ -226,6 +227,7 @@ static void afs_apply_status(struct afs_operation *op, } else { set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); } + change_size = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -233,11 +235,19 @@ static void afs_apply_status(struct afs_operation *op, */ if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) data_changed = false; + change_size = true; } if (data_changed) { inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); - afs_set_i_size(vnode, status->size); + + /* Only update the size if the data version jumped. If the + * file is being modified locally, then we might have our own + * idea of what the size should be that's not the same as + * what's on the server. + */ + if (change_size) + afs_set_i_size(vnode, status->size); } } -- cgit From da8d07551275abb3a38fae2d16e02bc9cc7396b2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 13 Jun 2020 19:34:59 +0100 Subject: afs: Concoct ctimes The in-kernel afs filesystem ignores ctime because the AFS fileserver protocol doesn't support ctimes. This, however, causes various xfstests to fail. Work around this by: (1) Setting ctime to attr->ia_ctime in afs_setattr(). (2) Not ignoring ATTR_MTIME_SET, ATTR_TIMES_SET and ATTR_TOUCH settings. (3) Setting the ctime from the server mtime when on the target file when creating a hard link to it. (4) Setting the ctime on directories from their revised mtimes when renaming/moving a file. Found by the generic/221 and generic/309 xfstests. Signed-off-by: David Howells --- fs/afs/dir.c | 18 +++++++++++++++++- fs/afs/inode.c | 29 ++++++++++++++++++----------- fs/afs/internal.h | 2 ++ fs/afs/write.c | 1 + 4 files changed, 38 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index aa1d34141ea3..308a125e9de3 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1268,6 +1268,7 @@ static void afs_vnode_new_inode(struct afs_operation *op) static void afs_create_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -1325,6 +1326,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; op->create.reason = afs_edit_dir_for_mkdir; @@ -1350,6 +1352,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) static void afs_rmdir_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -1404,6 +1407,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->ops = &afs_rmdir_operation; @@ -1479,6 +1483,7 @@ static void afs_dir_remove_link(struct afs_operation *op) static void afs_unlink_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); @@ -1537,6 +1542,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. */ ret = afs_validate(vnode, op->key); @@ -1561,6 +1567,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); op->file[1].vnode = vnode; + op->file[1].update_ctime = true; op->dentry = dentry; op->ops = &afs_unlink_operation; return afs_do_sync_operation(op); @@ -1601,6 +1608,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFREG | mode; @@ -1620,6 +1628,7 @@ static void afs_link_success(struct afs_operation *op) struct afs_vnode_param *vp = &op->file[1]; _enter("op=%08x", op->debug_id); + op->ctime = dvp->scb.status.mtime_client; afs_vnode_commit_status(op, dvp); afs_vnode_commit_status(op, vp); afs_update_dentry_version(op, dvp, op->dentry); @@ -1672,6 +1681,8 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = dentry; op->dentry_2 = from; @@ -1740,9 +1751,12 @@ static void afs_rename_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); - if (op->file[1].vnode != op->file[0].vnode) + if (op->file[1].vnode != op->file[0].vnode) { + op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); + } } static void afs_rename_edit_dir(struct afs_operation *op) @@ -1860,6 +1874,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = old_dentry; op->dentry_2 = new_dentry; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 8d10bfb392d1..e99705474dd1 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -165,6 +165,7 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->vfs_inode; struct timespec64 t; umode_t mode; bool data_changed = false; @@ -187,25 +188,25 @@ static void afs_apply_status(struct afs_operation *op, } if (status->nlink != vnode->status.nlink) - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(inode, status->nlink); if (status->owner != vnode->status.owner) - vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner); + inode->i_uid = make_kuid(&init_user_ns, status->owner); if (status->group != vnode->status.group) - vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group); + inode->i_gid = make_kgid(&init_user_ns, status->group); if (status->mode != vnode->status.mode) { - mode = vnode->vfs_inode.i_mode; + mode = inode->i_mode; mode &= ~S_IALLUGO; mode |= status->mode; - WRITE_ONCE(vnode->vfs_inode.i_mode, mode); + WRITE_ONCE(inode->i_mode, mode); } t = status->mtime_client; - vnode->vfs_inode.i_ctime = t; - vnode->vfs_inode.i_mtime = t; - vnode->vfs_inode.i_atime = t; + inode->i_mtime = t; + if (vp->update_ctime) + inode->i_ctime = op->ctime; if (vnode->status.data_version != status->data_version) data_changed = true; @@ -239,15 +240,18 @@ static void afs_apply_status(struct afs_operation *op, } if (data_changed) { - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + inode_set_iversion_raw(inode, status->data_version); /* Only update the size if the data version jumped. If the * file is being modified locally, then we might have our own * idea of what the size should be that's not the same as * what's on the server. */ - if (change_size) + if (change_size) { afs_set_i_size(vnode, status->size); + inode->i_ctime = t; + inode->i_atime = t; + } } } @@ -817,7 +821,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid); if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME))) { + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | + ATTR_TOUCH))) { _leave(" = 0 [unsupported]"); return 0; } @@ -837,6 +842,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) op->file[0].dv_delta = 1; + op->ctime = attr->ia_ctime; + op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; return afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0c9806ef2a19..92cd6b8cc01f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -746,6 +746,7 @@ struct afs_vnode_param { u8 dv_delta; /* Expected change in data version */ bool put_vnode; /* T if we have a ref on the vnode */ bool need_io_lock; /* T if we need the I/O lock on this */ + bool update_ctime; /* Need to update the ctime */ }; /* @@ -766,6 +767,7 @@ struct afs_operation { struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ + struct timespec64 ctime; /* Change time to set */ short nr_files; /* Number of entries in file[], more_files */ short error; unsigned int abort_code; diff --git a/fs/afs/write.c b/fs/afs/write.c index a55cb73e0449..2003d7ee9e43 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -393,6 +393,7 @@ static void afs_store_data_success(struct afs_operation *op) { struct afs_vnode *vnode = op->file[0].vnode; + op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { afs_pages_written_back(vnode, op->store.first, op->store.last); -- cgit From 793fe82ee33aab1023cf023cd7d744af19a3dff9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Jun 2020 16:13:52 +0100 Subject: afs: Fix truncation issues and mmap writeback size Fix the following issues: (1) Fix writeback to reduce the size of a store operation to i_size, effectively discarding the extra data. The problem comes when afs_page_mkwrite() records that a page is about to be modified by mmap(). It doesn't know what bits of the page are going to be modified, so it records the whole page as being dirty (this is stored in page->private as start and end offsets). Without this, the marshalling for the store to the server extends the size of the file to the end of the page (in afs_fs_store_data() and yfs_fs_store_data()). (2) Fix setattr to actually truncate the pagecache, thereby clearing the discarded part of a file. (3) Fix setattr to check that the new size is okay and to disable ATTR_SIZE if i_size wouldn't change. (4) Force i_size to be updated as the result of a truncate. (5) Don't truncate if ATTR_SIZE is not set. (6) Call pagecache_isize_extended() if the file was enlarged. Note that truncate_set_size() isn't used because the setting of i_size is done inside afs_vnode_commit_status() under the vnode->cb_lock. Found with the generic/029 and generic/393 xfstests. Fixes: 31143d5d515e ("AFS: implement basic file write support") Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/inode.c | 27 +++++++++++++++++++++++++-- fs/afs/internal.h | 7 ++++--- fs/afs/write.c | 6 ++++++ 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e99705474dd1..70c925978d10 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -169,7 +169,7 @@ static void afs_apply_status(struct afs_operation *op, struct timespec64 t; umode_t mode; bool data_changed = false; - bool change_size = false; + bool change_size = vp->set_size; _enter("{%llx:%llu.%u} %s", vp->fid.vid, vp->fid.vnode, vp->fid.unique, @@ -799,7 +799,15 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { + struct inode *inode = &op->file[0].vnode->vfs_inode; + afs_vnode_commit_status(op, &op->file[0]); + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; + if (size > i_size) + pagecache_isize_extended(inode, i_size, size); + truncate_pagecache(inode, size); + } } static const struct afs_operation_ops afs_setattr_operation = { @@ -815,6 +823,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + int ret; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, @@ -827,6 +836,18 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) return 0; } + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(vnode->vfs_inode.i_mode)) + return -EISDIR; + + ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size); + if (ret) + return ret; + + if (attr->ia_size == i_size_read(&vnode->vfs_inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + /* flush any dirty data outstanding on a regular file */ if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); @@ -840,8 +861,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; - if (attr->ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { op->file[0].dv_delta = 1; + op->file[0].set_size = true; + } op->ctime = attr->ia_ctime; op->file[0].update_ctime = 1; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 92cd6b8cc01f..bdc1e5efebd4 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -744,9 +744,10 @@ struct afs_vnode_param { afs_dataversion_t dv_before; /* Data version before the call */ unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ u8 dv_delta; /* Expected change in data version */ - bool put_vnode; /* T if we have a ref on the vnode */ - bool need_io_lock; /* T if we need the I/O lock on this */ - bool update_ctime; /* Need to update the ctime */ + bool put_vnode:1; /* T if we have a ref on the vnode */ + bool need_io_lock:1; /* T if we need the I/O lock on this */ + bool update_ctime:1; /* Need to update the ctime */ + bool set_size:1; /* Must update i_size */ }; /* diff --git a/fs/afs/write.c b/fs/afs/write.c index 2003d7ee9e43..7437806332d9 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -492,6 +492,7 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, unsigned long count, priv; unsigned n, offset, to, f, t; pgoff_t start, first, last; + loff_t i_size, end; int loop, ret; _enter(",%lx", primary_page->index); @@ -592,7 +593,12 @@ no_more: first = primary_page->index; last = first + count - 1; + end = (loff_t)last * PAGE_SIZE + to; + i_size = i_size_read(&vnode->vfs_inode); + _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); + if (end > i_size) + to = i_size & ~PAGE_MASK; ret = afs_store_data(mapping, first, last, offset, to); switch (ret) { -- cgit From 4ec89596d06bd481ba827f3b409b938d63914157 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 14 Jun 2020 22:12:05 +0100 Subject: afs: Fix the mapping of the UAEOVERFLOW abort code Abort code UAEOVERFLOW is returned when we try and set a time that's out of range, but it's currently mapped to EREMOTEIO by the default case. Fix UAEOVERFLOW to map instead to EOVERFLOW. Found with the generic/258 xfstest. Note that the test is wrong as it assumes that the filesystem will support a pre-UNIX-epoch date. Fixes: 1eda8bab70ca ("afs: Add support for the UAE error table") Signed-off-by: David Howells --- fs/afs/misc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 52b19e9c1535..5334f1bd2bca 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -83,6 +83,7 @@ int afs_abort_to_error(u32 abort_code) case UAENOLCK: return -ENOLCK; case UAENOTEMPTY: return -ENOTEMPTY; case UAELOOP: return -ELOOP; + case UAEOVERFLOW: return -EOVERFLOW; case UAENOMEDIUM: return -ENOMEDIUM; case UAEDQUOT: return -EDQUOT; -- cgit From f4c2665e33f48904f2766d644df33fb3fd54b5ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:02 +0300 Subject: io-wq: reorder cancellation pending -> running Go all over all pending lists and cancel works there, and only then try to match running requests. No functional changes here, just a preparation for bulk cancellation. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0b65a912b036..03c7e37548c2 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -927,19 +927,14 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return ret; } -static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static bool io_wqe_cancel_pending_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; - /* - * First check pending list, if we're lucky we can just remove it - * from there. CANCEL_OK means that the work is returned as-new, - * no completion will be posted for it. - */ spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); @@ -952,21 +947,20 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, } spin_unlock_irqrestore(&wqe->lock, flags); - if (found) { + if (found) io_run_cancel(work, wqe); - return IO_WQ_CANCEL_OK; - } + return found; +} + +static bool io_wqe_cancel_running_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) +{ + bool found; - /* - * Now check if a free (going busy) or busy worker has the work - * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempt to signal cancellation. The - * completion will run normally in this case. - */ rcu_read_lock(); found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); - return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; + return found; } enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, @@ -976,18 +970,34 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .fn = cancel, .data = data, }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; + /* + * First check pending list, if we're lucky we can just remove it + * from there. CANCEL_OK means that the work is returned as-new, + * no completion will be posted for it. + */ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; + if (io_wqe_cancel_pending_work(wqe, &match)) + return IO_WQ_CANCEL_OK; } - return ret; + /* + * Now check if a free (going busy) or busy worker has the work + * currently running. If we find it there, we'll return CANCEL_RUNNING + * as an indication that we attempt to signal cancellation. The + * completion will run normally in this case. + */ + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + if (io_wqe_cancel_running_work(wqe, &match)) + return IO_WQ_CANCEL_RUNNING; + } + + return IO_WQ_CANCEL_NOTFOUND; } static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) -- cgit From 4f26bda1522c35d2701fc219368c7101c17005c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:03 +0300 Subject: io-wq: add an option to cancel all matched reqs This adds support for cancelling all io-wq works matching a predicate. It isn't used yet, so no change in observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 60 +++++++++++++++++++++++++++++++++-------------------------- fs/io-wq.h | 2 +- fs/io_uring.c | 2 +- 3 files changed, 36 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 03c7e37548c2..e290202d6f64 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq) struct io_cb_cancel_data { work_cancel_fn *fn; void *data; + int nr_running; + int nr_pending; + bool cancel_all; }; static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; unsigned long flags; - bool ret = false; /* * Hold the lock to avoid ->cur_work going out of scope, caller @@ -920,55 +922,55 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && match->fn(worker->cur_work, match->data)) { send_sig(SIGINT, worker->task, 1); - ret = true; + match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); - return ret; + return match->nr_running && !match->cancel_all; } -static bool io_wqe_cancel_pending_work(struct io_wqe *wqe, +static void io_wqe_cancel_pending_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; - bool found = false; +retry: spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); + if (!match->fn(work, match->data)) + continue; - if (match->fn(work, match->data)) { - wq_list_del(&wqe->work_list, node, prev); - found = true; - break; - } + wq_list_del(&wqe->work_list, node, prev); + spin_unlock_irqrestore(&wqe->lock, flags); + io_run_cancel(work, wqe); + match->nr_pending++; + if (!match->cancel_all) + return; + + /* not safe to continue after unlock */ + goto retry; } spin_unlock_irqrestore(&wqe->lock, flags); - - if (found) - io_run_cancel(work, wqe); - return found; } -static bool io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wqe_cancel_running_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { - bool found; - rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); - return found; } enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data) + void *data, bool cancel_all) { struct io_cb_cancel_data match = { - .fn = cancel, - .data = data, + .fn = cancel, + .data = data, + .cancel_all = cancel_all, }; int node; @@ -980,7 +982,8 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - if (io_wqe_cancel_pending_work(wqe, &match)) + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; } @@ -993,10 +996,15 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - if (io_wqe_cancel_running_work(wqe, &match)) + io_wqe_cancel_running_work(wqe, &match); + if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; } + if (match.nr_running) + return IO_WQ_CANCEL_RUNNING; + if (match.nr_pending) + return IO_WQ_CANCEL_OK; return IO_WQ_CANCEL_NOTFOUND; } @@ -1007,7 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); } static bool io_wq_pid_match(struct io_wq_work *work, void *data) @@ -1021,7 +1029,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) { void *data = (void *) (unsigned long) pid; - return io_wq_cancel_cb(wq, io_wq_pid_match, data); + return io_wq_cancel_cb(wq, io_wq_pid_match, data, false); } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 8e138fa88b9f..7d5bd431c5e3 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -130,7 +130,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data); + void *data, bool cancel_all); struct task_struct *io_wq_get_task(struct io_wq *wq); diff --git a/fs/io_uring.c b/fs/io_uring.c index c04b20bf2803..94bd88556c1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4773,7 +4773,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; -- cgit From 44e728b8aae0bb6d4229129083974f9dea43f50b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:04 +0300 Subject: io_uring: cancel all task's requests on exit If a process is going away, io_uring_flush() will cancel only 1 request with a matching pid. Cancel all of them Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 14 -------------- fs/io-wq.h | 1 - fs/io_uring.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index e290202d6f64..47c5f3aeb460 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1018,20 +1018,6 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); } -static bool io_wq_pid_match(struct io_wq_work *work, void *data) -{ - pid_t pid = (pid_t) (unsigned long) data; - - return work->task_pid == pid; -} - -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) -{ - void *data = (void *) (unsigned long) pid; - - return io_wq_cancel_cb(wq, io_wq_pid_match, data, false); -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index 7d5bd431c5e3..b72538fe5afd 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -125,7 +125,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 94bd88556c1e..ad5128a40c14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7424,6 +7424,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +{ + pid_t pid = (pid_t) (unsigned long) data; + + return work->task_pid == pid; +} + static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; @@ -7433,8 +7440,11 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { + void *data = (void *) (unsigned long)task_pid_vnr(current); + + io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); + } return 0; } -- cgit From 67c4d9e693e3bb7fb968af24e3584f821a78ba56 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:05 +0300 Subject: io_uring: batch cancel in io_uring_cancel_files() Instead of waiting for each request one by one, first try to cancel all of them in a batched manner, and then go over inflight_list/etc to reap leftovers. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ad5128a40c14..b6bcd5a7f4bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7366,9 +7366,22 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static bool io_wq_files_match(struct io_wq_work *work, void *data) +{ + struct files_struct *files = data; + + return work->files == files; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { + if (list_empty_careful(&ctx->inflight_list)) + return; + + /* cancel all at once, should be faster than doing it one by one*/ + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); + while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); -- cgit From 4dd2824d6d5914949b5fe589538bc2622d84c5dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:13 +0300 Subject: io_uring: lazy get task There will be multiple places where req->task is used, so refcount-pin it lazily with introduced *io_{get,put}_req_task(). We need to always have valid ->task for cancellation reasons, but don't care about pinning it in some cases. That's why it sets req->task in io_req_init() and implements get/put laziness with a flag. This also removes using @current from polling io_arm_poll_handler(), etc., but doesn't change observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index b6bcd5a7f4bc..5f946eb8b740 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -598,6 +599,8 @@ enum { REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* req->task is refcounted */ + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -910,6 +913,21 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_get_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + return; + get_task_struct(req->task); + req->flags |= REQ_F_TASK_PINNED; +} + +/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +static void __io_put_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + put_task_struct(req->task); +} + static void io_file_put_work(struct work_struct *work); /* @@ -1399,9 +1417,7 @@ static void __io_req_aux_free(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->task) - put_task_struct(req->task); - + __io_put_req_task(req); io_req_work_drop_env(req); } @@ -4367,8 +4383,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) memcpy(&apoll->work, &req->work, sizeof(req->work)); had_io = req->io != NULL; - get_task_struct(current); - req->task = current; + io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4556,8 +4571,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - get_task_struct(current); - req->task = current; + io_get_req_task(req); return 0; } @@ -5818,7 +5832,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); - req->task = NULL; + req->task = current; req->result = 0; if (unlikely(req->opcode >= IORING_OP_LAST)) -- cgit From 801dd57bd1d8c2c253f43635a3045bfa32a810b3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:14 +0300 Subject: io_uring: cancel by ->task not pid For an exiting process it tries to cancel all its inflight requests. Use req->task to match such instead of work.pid. We always have req->task set, and it will be valid because we're matching only current exiting task. Also, remove work.pid and everything related, it's useless now. Reported-by: Eric W. Biederman Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 1 - fs/io_uring.c | 16 ++++++---------- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.h b/fs/io-wq.h index b72538fe5afd..071f1a997800 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -90,7 +90,6 @@ struct io_wq_work { const struct cred *creds; struct fs_struct *fs; unsigned flags; - pid_t task_pid; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5f946eb8b740..e17df662c191 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1063,8 +1063,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } spin_unlock(¤t->fs->lock); } - if (!req->work.task_pid) - req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -7451,11 +7449,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { - pid_t pid = (pid_t) (unsigned long) data; + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; - return work->task_pid == pid; + return req->task == task; } static int io_uring_flush(struct file *file, void *data) @@ -7467,11 +7466,8 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { - void *data = (void *) (unsigned long)task_pid_vnr(current); - - io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); - } + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); return 0; } -- cgit From 241cb28e38c322068c87da4c479e07fbae0c1a4e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 May 2020 09:35:11 -0500 Subject: aio: Replace zero-length array with flexible-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://github.com/KSPP/linux/issues/21 Signed-off-by: Gustavo A. R. Silva --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 7ecddc2f38db..91e7cc4a9f17 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -67,7 +67,7 @@ struct aio_ring { unsigned header_length; /* size of aio_ring */ - struct io_event io_events[0]; + struct io_event io_events[]; }; /* 128 bytes + ring size */ /* -- cgit From 6112bad79f4de4bde62f630494760a300724e4b4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 May 2020 09:35:11 -0500 Subject: jffs2: Replace zero-length array with flexible-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://github.com/KSPP/linux/issues/21 Signed-off-by: Gustavo A. R. Silva --- fs/jffs2/nodelist.h | 2 +- fs/jffs2/summary.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 0637271f3770..8ff4d1a1e774 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -259,7 +259,7 @@ struct jffs2_full_dirent uint32_t ino; /* == zero for unlink */ unsigned int nhash; unsigned char type; - unsigned char name[0]; + unsigned char name[]; }; /* diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index 60207a2ae952..e4131cb1f1d4 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h @@ -61,7 +61,7 @@ struct jffs2_sum_dirent_flash jint32_t ino; /* == zero for unlink */ uint8_t nsize; /* dirent name size */ uint8_t type; /* dirent type */ - uint8_t name[0]; /* dirent name */ + uint8_t name[]; /* dirent name */ } __attribute__((packed)); struct jffs2_sum_xattr_flash @@ -117,7 +117,7 @@ struct jffs2_sum_dirent_mem jint32_t ino; /* == zero for unlink */ uint8_t nsize; /* dirent name size */ uint8_t type; /* dirent type */ - uint8_t name[0]; /* dirent name */ + uint8_t name[]; /* dirent name */ } __attribute__((packed)); struct jffs2_sum_xattr_mem -- cgit From b2b32e3aa08475d98d2c325fa5886e151fcd2981 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 May 2020 09:35:11 -0500 Subject: Squashfs: Replace zero-length array with flexible-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://github.com/KSPP/linux/issues/21 Signed-off-by: Gustavo A. R. Silva --- fs/squashfs/squashfs_fs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 7187bd1a30ea..8d64edb80ebf 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -262,7 +262,7 @@ struct squashfs_dir_index { __le32 index; __le32 start_block; __le32 size; - unsigned char name[0]; + unsigned char name[]; }; struct squashfs_base_inode { @@ -327,7 +327,7 @@ struct squashfs_symlink_inode { __le32 inode_number; __le32 nlink; __le32 symlink_size; - char symlink[0]; + char symlink[]; }; struct squashfs_reg_inode { @@ -341,7 +341,7 @@ struct squashfs_reg_inode { __le32 fragment; __le32 offset; __le32 file_size; - __le16 block_list[0]; + __le16 block_list[]; }; struct squashfs_lreg_inode { @@ -358,7 +358,7 @@ struct squashfs_lreg_inode { __le32 fragment; __le32 offset; __le32 xattr; - __le16 block_list[0]; + __le16 block_list[]; }; struct squashfs_dir_inode { @@ -389,7 +389,7 @@ struct squashfs_ldir_inode { __le16 i_count; __le16 offset; __le32 xattr; - struct squashfs_dir_index index[0]; + struct squashfs_dir_index index[]; }; union squashfs_inode { @@ -410,7 +410,7 @@ struct squashfs_dir_entry { __le16 inode_number; __le16 type; __le16 size; - char name[0]; + char name[]; }; struct squashfs_dir_header { @@ -428,12 +428,12 @@ struct squashfs_fragment_entry { struct squashfs_xattr_entry { __le16 type; __le16 size; - char data[0]; + char data[]; }; struct squashfs_xattr_val { __le32 vsize; - char value[0]; + char value[]; }; struct squashfs_xattr_id { -- cgit From 6c85cacc8c096fc5cbdba61b6aa8fe675805e5d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2020 00:18:09 +0100 Subject: afs: Remove yfs_fs_fetch_file_status() as it's not used Remove yfs_fs_fetch_file_status() as it's no longer used. Signed-off-by: David Howells --- fs/afs/internal.h | 1 - fs/afs/yfsclient.c | 42 ------------------------------------------ 2 files changed, 43 deletions(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index bdc1e5efebd4..d2207cb40740 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1438,7 +1438,6 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c */ -extern void yfs_fs_fetch_file_status(struct afs_operation *); extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 52d5af5fcd44..993591739240 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -374,48 +374,6 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) return 0; } -/* - * YFS.FetchStatus operation type - */ -static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { - .name = "YFS.FetchStatus(vnode)", - .op = yfs_FS_FetchStatus, - .deliver = yfs_deliver_fs_status_cb_and_volsync, - .destructor = afs_flat_call_destructor, -}; - -/* - * Fetch the status information for a file. - */ -void yfs_fs_fetch_file_status(struct afs_operation *op) -{ - struct afs_vnode_param *vp = &op->file[0]; - struct afs_call *call; - __be32 *bp; - - _enter(",%x,{%llx:%llu},,", - key_serial(op->key), vp->fid.vid, vp->fid.vnode); - - call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode, - sizeof(__be32) * 2 + - sizeof(struct yfs_xdr_YFSFid), - sizeof(struct yfs_xdr_YFSFetchStatus) + - sizeof(struct yfs_xdr_YFSCallBack) + - sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) - return afs_op_nomem(op); - - /* marshall the parameters */ - bp = call->request; - bp = xdr_encode_u32(bp, YFSFETCHSTATUS); - bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vp->fid); - yfs_check_req(call, bp); - - trace_afs_make_fs_call(call, &vp->fid); - afs_make_op_call(op, call, GFP_NOFS); -} - /* * Deliver reply data to an YFS.FetchData64. */ -- cgit From 9bd87ec631ba07285138eed9c85645a12294f6c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2020 00:23:12 +0100 Subject: afs: Fix yfs_fs_fetch_status() to honour vnode selector Fix yfs_fs_fetch_status() to honour the vnode selector in op->fetch_status.which as does afs_fs_fetch_status() that allows afs_do_lookup() to use this as an alternative to the InlineBulkStatus RPC call if not implemented by the server. This doesn't matter in the current code as YFS servers always implement InlineBulkStatus, but a subsequent will call it on YFS servers too in some circumstances. Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: David Howells --- fs/afs/yfsclient.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 993591739240..8c24fdc899e3 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -329,29 +329,6 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, *_bp += sizeof(*x) / sizeof(__be32); } -/* - * Deliver a reply that's a status, callback and volsync. - */ -static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) -{ - struct afs_operation *op = call->op; - const __be32 *bp; - int ret; - - ret = afs_transfer_reply(call); - if (ret < 0) - return ret; - - /* unmarshall the reply once we've received all of it */ - bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb); - xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb); - xdr_decode_YFSVolSync(&bp, &op->volsync); - - _leave(" = 0 [done]"); - return 0; -} - /* * Deliver reply data to operations that just return a file status and a volume * sync record. @@ -1562,13 +1539,37 @@ void yfs_fs_release_lock(struct afs_operation *op) afs_make_op_call(op, call, GFP_NOFS); } +/* + * Deliver a reply to YFS.FetchStatus + */ +static int yfs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + /* * YFS.FetchStatus operation type */ static const struct afs_call_type yfs_RXYFSFetchStatus = { .name = "YFS.FetchStatus", .op = yfs_FS_FetchStatus, - .deliver = yfs_deliver_fs_status_cb_and_volsync, + .deliver = yfs_deliver_fs_fetch_status, .destructor = afs_flat_call_destructor, }; @@ -1577,7 +1578,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = { */ void yfs_fs_fetch_status(struct afs_operation *op) { - struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_call *call; __be32 *bp; -- cgit From 44767c353127cfcbee49a89bab39a3680ecd2a45 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2020 00:25:56 +0100 Subject: afs: Remove afs_operation::abort_code Remove afs_operation::abort_code as it's read but never set. Use ac.abort_code instead. Signed-off-by: David Howells --- fs/afs/dir.c | 2 +- fs/afs/internal.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 308a125e9de3..ca6b147963a9 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -648,7 +648,7 @@ static void afs_do_lookup_success(struct afs_operation *op) vp = &op->file[0]; abort_code = vp->scb.status.abort_code; if (abort_code != 0) { - op->abort_code = abort_code; + op->ac.abort_code = abort_code; op->error = afs_abort_to_error(abort_code); } break; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d2207cb40740..598934d923cc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -771,7 +771,6 @@ struct afs_operation { struct timespec64 ctime; /* Change time to set */ short nr_files; /* Number of entries in file[], more_files */ short error; - unsigned int abort_code; unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ -- cgit From 728279a5a1fd9fa9fa268f807391c4d19ad2822c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2020 00:34:09 +0100 Subject: afs: Fix use of afs_check_for_remote_deletion() afs_check_for_remote_deletion() checks to see if error ENOENT is returned by the server in response to an operation and, if so, marks the primary vnode as having been deleted as the FID is no longer valid. However, it's being called from the operation success functions, where no abort has happened - and if an inline abort is recorded, it's handled by afs_vnode_commit_status(). Fix this by actually calling the operation aborted method if provided and having that point to afs_check_for_remote_deletion(). Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: David Howells --- fs/afs/dir.c | 21 ++++++++++++++++++--- fs/afs/dir_silly.c | 2 +- fs/afs/file.c | 2 +- fs/afs/flock.c | 4 +--- fs/afs/fs_operation.c | 10 +++++++++- fs/afs/inode.c | 1 + fs/afs/internal.h | 10 +--------- 7 files changed, 32 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ca6b147963a9..cd74731112f4 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -700,6 +700,7 @@ static const struct afs_operation_ops afs_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_do_lookup_success, + .aborted = afs_check_for_remote_deletion, }; /* @@ -1236,6 +1237,17 @@ void afs_d_release(struct dentry *dentry) _enter("%pd", dentry); } +void afs_check_for_remote_deletion(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + switch (op->ac.abort_code) { + case VNOVNODE: + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_break_callback(vnode, afs_cb_break_for_deleted); + } +} + /* * Create a new inode for create/mkdir/symlink */ @@ -1269,7 +1281,6 @@ static void afs_create_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; - afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); afs_vnode_new_inode(op); @@ -1303,6 +1314,7 @@ static const struct afs_operation_ops afs_mkdir_operation = { .issue_afs_rpc = afs_fs_make_dir, .issue_yfs_rpc = yfs_fs_make_dir, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; @@ -1353,7 +1365,6 @@ static void afs_rmdir_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; - afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); } @@ -1385,6 +1396,7 @@ static const struct afs_operation_ops afs_rmdir_operation = { .issue_afs_rpc = afs_fs_remove_dir, .issue_yfs_rpc = yfs_fs_remove_dir, .success = afs_rmdir_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_rmdir_edit_dir, .put = afs_rmdir_put, }; @@ -1484,7 +1496,6 @@ static void afs_unlink_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; - afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -1516,6 +1527,7 @@ static const struct afs_operation_ops afs_unlink_operation = { .issue_afs_rpc = afs_fs_remove_file, .issue_yfs_rpc = yfs_fs_remove_file, .success = afs_unlink_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_unlink_edit_dir, .put = afs_unlink_put, }; @@ -1580,6 +1592,7 @@ static const struct afs_operation_ops afs_create_operation = { .issue_afs_rpc = afs_fs_create_file, .issue_yfs_rpc = yfs_fs_create_file, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; @@ -1649,6 +1662,7 @@ static const struct afs_operation_ops afs_link_operation = { .issue_afs_rpc = afs_fs_link, .issue_yfs_rpc = yfs_fs_link, .success = afs_link_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_link_put, }; @@ -1700,6 +1714,7 @@ static const struct afs_operation_ops afs_symlink_operation = { .issue_afs_rpc = afs_fs_symlink, .issue_yfs_rpc = yfs_fs_symlink, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index b14e3d9a25e2..001adb87ff23 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -151,7 +151,6 @@ static void afs_silly_unlink_success(struct afs_operation *op) struct afs_vnode *vnode = op->file[1].vnode; _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, op->file[0].vnode); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -181,6 +180,7 @@ static const struct afs_operation_ops afs_silly_unlink_operation = { .issue_afs_rpc = afs_fs_remove_file, .issue_yfs_rpc = yfs_fs_remove_file, .success = afs_silly_unlink_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_silly_unlink_edit_dir, }; diff --git a/fs/afs/file.c b/fs/afs/file.c index 506c47471b42..6f6ed1605cfe 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -225,7 +225,6 @@ static void afs_fetch_data_success(struct afs_operation *op) struct afs_vnode *vnode = op->file[0].vnode; _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, vnode); afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); @@ -240,6 +239,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, + .aborted = afs_check_for_remote_deletion, .put = afs_fetch_data_put, }; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 71eea2a908c7..ffb8575345ca 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -175,10 +175,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode) static void afs_lock_success(struct afs_operation *op) { - struct afs_vnode *vnode = op->file[0].vnode; - _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, vnode); afs_vnode_commit_status(op, &op->file[0]); } @@ -186,6 +183,7 @@ static const struct afs_operation_ops afs_set_lock_operation = { .issue_afs_rpc = afs_fs_set_lock, .issue_yfs_rpc = yfs_fs_set_lock, .success = afs_lock_success, + .aborted = afs_check_for_remote_deletion, }; /* diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 2d2dff5688a4..c264839b2fd0 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -187,9 +187,17 @@ void afs_wait_for_operation(struct afs_operation *op) op->error = afs_wait_for_call_to_complete(op->call, &op->ac); } - if (op->error == 0) { + switch (op->error) { + case 0: _debug("success"); op->ops->success(op); + break; + case -ECONNABORTED: + if (op->ops->aborted) + op->ops->aborted(op); + break; + default: + break; } afs_end_vnode_operation(op); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 70c925978d10..56e60d561f37 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -324,6 +324,7 @@ static const struct afs_operation_ops afs_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_fetch_status_success, + .aborted = afs_check_for_remote_deletion, }; /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 598934d923cc..9420890e3577 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -934,6 +934,7 @@ extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; extern void afs_d_release(struct dentry *); +extern void afs_check_for_remote_deletion(struct afs_operation *); /* * dir_edit.c @@ -1482,15 +1483,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) return &vnode->vfs_inode; } -static inline void afs_check_for_remote_deletion(struct afs_operation *op, - struct afs_vnode *vnode) -{ - if (op->error == -ENOENT) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - afs_break_callback(vnode, afs_cb_break_for_deleted); - } -} - /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if -- cgit From 7c295eec1e351003a8ca06c34f9e79336fa5b244 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2020 00:52:30 +0100 Subject: afs: afs_vnode_commit_status() doesn't need to check the RPC error afs_vnode_commit_status() is only ever called if op->error is 0, so remove the op->error checks from the function. Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: David Howells --- fs/afs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 56e60d561f37..d5d0ae7b2b1e 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -281,8 +281,6 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v _enter(""); - ASSERTCMP(op->error, ==, 0); - write_seqlock(&vnode->cb_lock); if (vp->scb.have_error) { @@ -300,7 +298,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v write_sequnlock(&vnode->cb_lock); - if (op->error == 0 && vp->scb.have_status) + if (vp->scb.have_status) afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } -- cgit From 2d3a8e2deddea6c89961c422ec0c5b851e648c14 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 16 Jun 2020 20:16:55 +0800 Subject: block: Fix use-after-free in blkdev_get() In blkdev_get() we call __blkdev_get() to do some internal jobs and if there is some errors in __blkdev_get(), the bdput() is called which means we have released the refcount of the bdev (actually the refcount of the bdev inode). This means we cannot access bdev after that point. But acctually bdev is still accessed in blkdev_get() after calling __blkdev_get(). This results in use-after-free if the refcount is the last one we released in __blkdev_get(). Let's take a look at the following scenerio: CPU0 CPU1 CPU2 blkdev_open blkdev_open Remove disk bd_acquire blkdev_get __blkdev_get del_gendisk bdev_unhash_inode bd_acquire bdev_get_gendisk bd_forget failed because of unhashed bdput bdput (the last one) bdev_evict_inode access bdev => use after free [ 459.350216] BUG: KASAN: use-after-free in __lock_acquire+0x24c1/0x31b0 [ 459.351190] Read of size 8 at addr ffff88806c815a80 by task syz-executor.0/20132 [ 459.352347] [ 459.352594] CPU: 0 PID: 20132 Comm: syz-executor.0 Not tainted 4.19.90 #2 [ 459.353628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 459.354947] Call Trace: [ 459.355337] dump_stack+0x111/0x19e [ 459.355879] ? __lock_acquire+0x24c1/0x31b0 [ 459.356523] print_address_description+0x60/0x223 [ 459.357248] ? __lock_acquire+0x24c1/0x31b0 [ 459.357887] kasan_report.cold+0xae/0x2d8 [ 459.358503] __lock_acquire+0x24c1/0x31b0 [ 459.359120] ? _raw_spin_unlock_irq+0x24/0x40 [ 459.359784] ? lockdep_hardirqs_on+0x37b/0x580 [ 459.360465] ? _raw_spin_unlock_irq+0x24/0x40 [ 459.361123] ? finish_task_switch+0x125/0x600 [ 459.361812] ? finish_task_switch+0xee/0x600 [ 459.362471] ? mark_held_locks+0xf0/0xf0 [ 459.363108] ? __schedule+0x96f/0x21d0 [ 459.363716] lock_acquire+0x111/0x320 [ 459.364285] ? blkdev_get+0xce/0xbe0 [ 459.364846] ? blkdev_get+0xce/0xbe0 [ 459.365390] __mutex_lock+0xf9/0x12a0 [ 459.365948] ? blkdev_get+0xce/0xbe0 [ 459.366493] ? bdev_evict_inode+0x1f0/0x1f0 [ 459.367130] ? blkdev_get+0xce/0xbe0 [ 459.367678] ? destroy_inode+0xbc/0x110 [ 459.368261] ? mutex_trylock+0x1a0/0x1a0 [ 459.368867] ? __blkdev_get+0x3e6/0x1280 [ 459.369463] ? bdev_disk_changed+0x1d0/0x1d0 [ 459.370114] ? blkdev_get+0xce/0xbe0 [ 459.370656] blkdev_get+0xce/0xbe0 [ 459.371178] ? find_held_lock+0x2c/0x110 [ 459.371774] ? __blkdev_get+0x1280/0x1280 [ 459.372383] ? lock_downgrade+0x680/0x680 [ 459.373002] ? lock_acquire+0x111/0x320 [ 459.373587] ? bd_acquire+0x21/0x2c0 [ 459.374134] ? do_raw_spin_unlock+0x4f/0x250 [ 459.374780] blkdev_open+0x202/0x290 [ 459.375325] do_dentry_open+0x49e/0x1050 [ 459.375924] ? blkdev_get_by_dev+0x70/0x70 [ 459.376543] ? __x64_sys_fchdir+0x1f0/0x1f0 [ 459.377192] ? inode_permission+0xbe/0x3a0 [ 459.377818] path_openat+0x148c/0x3f50 [ 459.378392] ? kmem_cache_alloc+0xd5/0x280 [ 459.379016] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.379802] ? path_lookupat.isra.0+0x900/0x900 [ 459.380489] ? __lock_is_held+0xad/0x140 [ 459.381093] do_filp_open+0x1a1/0x280 [ 459.381654] ? may_open_dev+0xf0/0xf0 [ 459.382214] ? find_held_lock+0x2c/0x110 [ 459.382816] ? lock_downgrade+0x680/0x680 [ 459.383425] ? __lock_is_held+0xad/0x140 [ 459.384024] ? do_raw_spin_unlock+0x4f/0x250 [ 459.384668] ? _raw_spin_unlock+0x1f/0x30 [ 459.385280] ? __alloc_fd+0x448/0x560 [ 459.385841] do_sys_open+0x3c3/0x500 [ 459.386386] ? filp_open+0x70/0x70 [ 459.386911] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 459.387610] ? trace_hardirqs_off_caller+0x55/0x1c0 [ 459.388342] ? do_syscall_64+0x1a/0x520 [ 459.388930] do_syscall_64+0xc3/0x520 [ 459.389490] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.390248] RIP: 0033:0x416211 [ 459.390720] Code: 75 14 b8 02 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 04 19 00 00 c3 48 83 ec 08 e8 0a fa ff ff 48 89 04 24 b8 02 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 53 fa ff ff 48 89 d0 48 83 c4 08 48 3d 01 [ 459.393483] RSP: 002b:00007fe45dfe9a60 EFLAGS: 00000293 ORIG_RAX: 0000000000000002 [ 459.394610] RAX: ffffffffffffffda RBX: 00007fe45dfea6d4 RCX: 0000000000416211 [ 459.395678] RDX: 00007fe45dfe9b0a RSI: 0000000000000002 RDI: 00007fe45dfe9b00 [ 459.396758] RBP: 000000000076bf20 R08: 0000000000000000 R09: 000000000000000a [ 459.397930] R10: 0000000000000075 R11: 0000000000000293 R12: 00000000ffffffff [ 459.399022] R13: 0000000000000bd9 R14: 00000000004cdb80 R15: 000000000076bf2c [ 459.400168] [ 459.400430] Allocated by task 20132: [ 459.401038] kasan_kmalloc+0xbf/0xe0 [ 459.401652] kmem_cache_alloc+0xd5/0x280 [ 459.402330] bdev_alloc_inode+0x18/0x40 [ 459.402970] alloc_inode+0x5f/0x180 [ 459.403510] iget5_locked+0x57/0xd0 [ 459.404095] bdget+0x94/0x4e0 [ 459.404607] bd_acquire+0xfa/0x2c0 [ 459.405113] blkdev_open+0x110/0x290 [ 459.405702] do_dentry_open+0x49e/0x1050 [ 459.406340] path_openat+0x148c/0x3f50 [ 459.406926] do_filp_open+0x1a1/0x280 [ 459.407471] do_sys_open+0x3c3/0x500 [ 459.408010] do_syscall_64+0xc3/0x520 [ 459.408572] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.409415] [ 459.409679] Freed by task 1262: [ 459.410212] __kasan_slab_free+0x129/0x170 [ 459.410919] kmem_cache_free+0xb2/0x2a0 [ 459.411564] rcu_process_callbacks+0xbb2/0x2320 [ 459.412318] __do_softirq+0x225/0x8ac Fix this by delaying bdput() to the end of blkdev_get() which means we have finished accessing bdev. Fixes: 77ea887e433a ("implement in-kernel gendisk events handling") Reported-by: Hulk Robot Signed-off-by: Jason Yan Tested-by: Sedat Dilek Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dan Carpenter Cc: Christoph Hellwig Cc: Jens Axboe Cc: Ming Lei Cc: Jan Kara Cc: Dan Carpenter Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 47860e589388..08c87db3a92b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1565,10 +1565,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ if (!for_part) { ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); + if (ret != 0) return ret; - } } restart: @@ -1637,8 +1635,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, mode, 1); - if (ret) + if (ret) { + bdput(whole); goto out_clear; + } bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || @@ -1688,7 +1688,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_unblock_events(disk); put_disk_and_module(disk); out: - bdput(bdev); return ret; } @@ -1755,6 +1754,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) bdput(whole); } + if (res) + bdput(bdev); + return res; } EXPORT_SYMBOL(blkdev_get); -- cgit From 9fecd13202f520f3f25d5b1c313adb740fe19773 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Jun 2020 19:12:06 +0100 Subject: btrfs: fix a block group ref counter leak after failure to remove block group When removing a block group, if we fail to delete the block group's item from the extent tree, we jump to the 'out' label and end up decrementing the block group's reference count once only (by 1), resulting in a counter leak because the block group at that point was already removed from the block group cache rbtree - so we have to decrement the reference count twice, once for the rbtree and once for our lookup at the start of the function. There is a second bug where if removing the free space tree entries (the call to remove_block_group_free_space()) fails we end up jumping to the 'out_put_group' label but end up decrementing the reference count only once, when we should have done it twice, since we have already removed the block group from the block group cache rbtree. This happens because the reference count decrement for the rbtree reference happens after attempting to remove the free space tree entries, which is far away from the place where we remove the block group from the rbtree. To make things less error prone, decrement the reference count for the rbtree immediately after removing the block group from it. This also eleminates the need for two different exit labels on error, renaming 'out_put_label' to just 'out' and removing the old 'out'. Fixes: f6033c5e333238 ("btrfs: fix block group leak when removing fails") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 176e8a292fd1..6462dd0b155c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out_put_group; + goto out; } /* @@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); - goto out_put_group; + goto out; } clear_nlink(inode); /* One for the block groups ref */ @@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) - goto out_put_group; + goto out; if (ret > 0) btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out_put_group; + goto out; btrfs_release_path(path); } @@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); + /* Once for the block groups rbtree */ + btrfs_put_block_group(block_group); + if (fs_info->first_logical_byte == block_group->start) fs_info->first_logical_byte = (u64)-1; spin_unlock(&fs_info->block_group_cache_lock); @@ -1125,10 +1128,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = remove_block_group_free_space(trans, block_group); if (ret) - goto out_put_group; - - /* Once for the block groups rbtree */ - btrfs_put_block_group(block_group); + goto out; ret = remove_block_group_item(trans, path, block_group); if (ret < 0) @@ -1145,10 +1145,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_extent_map(em); } -out_put_group: +out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); -out: if (remove_rsv) btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); -- cgit From ffcb9d44572afbaf8fa6dbf5115bff6dab7b299e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Jun 2020 19:12:19 +0100 Subject: btrfs: fix race between block group removal and block group creation There is a race between block group removal and block group creation when the removal is completed by a task running fitrim or scrub. When this happens we end up failing the block group creation with an error -EEXIST since we attempt to insert a duplicate block group item key in the extent tree. That results in a transaction abort. The race happens like this: 1) Task A is doing a fitrim, and at btrfs_trim_block_group() it freezes block group X with btrfs_freeze_block_group() (until very recently that was named btrfs_get_block_group_trimming()); 2) Task B starts removing block group X, either because it's now unused or due to relocation for example. So at btrfs_remove_block_group(), while holding the chunk mutex and the block group's lock, it sets the 'removed' flag of the block group and it sets the local variable 'remove_em' to false, because the block group is currently frozen (its 'frozen' counter is > 0, until very recently this counter was named 'trimming'); 3) Task B unlocks the block group and the chunk mutex; 4) Task A is done trimming the block group and unfreezes the block group by calling btrfs_unfreeze_block_group() (until very recently this was named btrfs_put_block_group_trimming()). In this function we lock the block group and set the local variable 'cleanup' to true because we were able to decrement the block group's 'frozen' counter down to 0 and the flag 'removed' is set in the block group. Since 'cleanup' is set to true, it locks the chunk mutex and removes the extent mapping representing the block group from the mapping tree; 5) Task C allocates a new block group Y and it picks up the logical address that block group X had as the logical address for Y, because X was the block group with the highest logical address and now the second block group with the highest logical address, the last in the fs mapping tree, ends at an offset corresponding to block group X's logical address (this logical address selection is done at volumes.c:find_next_chunk()). At this point the new block group Y does not have yet its item added to the extent tree (nor the corresponding device extent items and chunk item in the device and chunk trees). The new group Y is added to the list of pending block groups in the transaction handle; 6) Before task B proceeds to removing the block group item for block group X from the extent tree, which has a key matching: (X logical offset, BTRFS_BLOCK_GROUP_ITEM_KEY, length) task C while ending its transaction handle calls btrfs_create_pending_block_groups(), which finds block group Y and tries to insert the block group item for Y into the exten tree, which fails with -EEXIST since logical offset is the same that X had and task B hasn't yet deleted the key from the extent tree. This failure results in a transaction abort, producing a stack like the following: ------------[ cut here ]------------ BTRFS: Transaction aborted (error -17) WARNING: CPU: 2 PID: 19736 at fs/btrfs/block-group.c:2074 btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) CPU: 2 PID: 19736 Comm: fsstress Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs] Code: ff ff ff 48 8b 55 50 f0 48 (...) RSP: 0018:ffffa4160a1c7d58 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff961581909d98 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffb3d63990 RDI: 0000000000000001 RBP: ffff9614f3356a58 R08: 0000000000000000 R09: 0000000000000001 R10: ffff9615b65b0040 R11: 0000000000000000 R12: ffff961581909c10 R13: ffff9615b0c32000 R14: ffff9614f3356ab0 R15: ffff9614be779000 FS: 00007f2ce2841e80(0000) GS:ffff9615bae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000555f18780000 CR3: 0000000131d34005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_start_dirty_block_groups+0x398/0x4e0 [btrfs] btrfs_commit_transaction+0xd0/0xc50 [btrfs] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] ? __ia32_sys_fdatasync+0x20/0x20 iterate_supers+0xdb/0x180 ksys_sync+0x60/0xb0 __ia32_sys_sync+0xa/0x10 do_syscall_64+0x5c/0x280 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f2ce1d4d5b7 Code: 83 c4 08 48 3d 01 (...) RSP: 002b:00007ffd8b558c58 EFLAGS: 00000202 ORIG_RAX: 00000000000000a2 RAX: ffffffffffffffda RBX: 000000000000002c RCX: 00007f2ce1d4d5b7 RDX: 00000000ffffffff RSI: 00000000186ba07b RDI: 000000000000002c RBP: 0000555f17b9e520 R08: 0000000000000012 R09: 000000000000ce00 R10: 0000000000000078 R11: 0000000000000202 R12: 0000000000000032 R13: 0000000051eb851f R14: 00007ffd8b558cd0 R15: 0000555f1798ec20 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 softirqs last enabled at (0): [] copy_process+0x74f/0x2020 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace bd7c03622e0b0a9c ]--- Fix this simply by making btrfs_remove_block_group() remove the block group's item from the extent tree before it flags the block group as removed. Also make the free space deletion from the free space tree before flagging the block group as removed, to avoid a similar race with adding and removing free space entries for the free space tree. Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6462dd0b155c..c037ef514b64 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1092,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); + /* + * Remove the free space for the block group from the free space tree + * and the block group's item from the extent tree before marking the + * block group as removed. This is to prevent races with tasks that + * freeze and unfreeze a block group, this task and another task + * allocating a new block group - the unfreeze task ends up removing + * the block group's extent map before the task calling this function + * deletes the block group item from the extent tree, allowing for + * another task to attempt to create another block group with the same + * item key (and failing with -EEXIST and a transaction abort). + */ + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + ret = remove_block_group_item(trans, path, block_group); + if (ret < 0) + goto out; + mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; @@ -1126,14 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out; - - ret = remove_block_group_item(trans, path, block_group); - if (ret < 0) - goto out; - if (remove_em) { struct extent_map_tree *em_tree; -- cgit From 432cd2a10f1c10cead91fe706ff5dc52f06d642a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Jun 2020 13:32:55 +0100 Subject: btrfs: fix data block group relocation failure due to concurrent scrub When running relocation of a data block group while scrub is running in parallel, it is possible that the relocation will fail and abort the current transaction with an -EINVAL error: [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents [134243.999871] ------------[ cut here ]------------ [134244.000741] BTRFS: Transaction aborted (error -22) [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs] [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs] [134244.017151] Code: 48 c7 c7 (...) [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286 [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000 [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001 [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001 [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08 [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000 [134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000 [134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0 [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [134244.034484] Call Trace: [134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs] [134244.035859] do_relocation+0x30b/0x790 [btrfs] [134244.036681] ? do_raw_spin_unlock+0x49/0xc0 [134244.037460] ? _raw_spin_unlock+0x29/0x40 [134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs] [134244.039245] relocate_block_group+0x388/0x770 [btrfs] [134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs] [134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs] [134244.041345] btrfs_balance+0xc06/0x1860 [btrfs] [134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs] [134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs] [134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs] [134244.049043] ? do_raw_spin_unlock+0x49/0xc0 [134244.049838] ? _raw_spin_unlock+0x29/0x40 [134244.050587] ? __handle_mm_fault+0x11b3/0x14b0 [134244.051417] ? ksys_ioctl+0x92/0xb0 [134244.052070] ksys_ioctl+0x92/0xb0 [134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c [134244.053511] __x64_sys_ioctl+0x16/0x20 [134244.054206] do_syscall_64+0x5c/0x280 [134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe [134244.055819] RIP: 0033:0x7f29b51c9dd7 [134244.056491] Code: 00 00 00 (...) [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7 [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003 [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000 [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0 [134244.067626] irq event stamp: 0 [134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [134244.069351] hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 [134244.070909] softirqs last enabled at (0): [] copy_process+0x74f/0x2020 [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0 [134244.073432] ---[ end trace bd7c03622e0b0a99 ]--- The -EINVAL error comes from the following chain of function calls: __btrfs_cow_block() <-- aborts the transaction btrfs_reloc_cow_block() replace_file_extents() get_new_location() <-- returns -EINVAL When relocating a data block group, for each allocated extent of the block group, we preallocate another extent (at prealloc_file_extent_cluster()), associated with the data relocation inode, and then dirty all its pages. These preallocated extents have, and must have, the same size that extents from the data block group being relocated have. Later before we start the relocation stage that updates pointers (bytenr field of file extent items) to point to the the new extents, we trigger writeback for the data relocation inode. The expectation is that writeback will write the pages to the previously preallocated extents, that it follows the NOCOW path. That is generally the case, however, if a scrub is running it may have turned the block group that contains those extents into RO mode, in which case writeback falls back to the COW path. However in the COW path instead of allocating exactly one extent with the expected size, the allocator may end up allocating several smaller extents due to free space fragmentation - because we tell it at cow_file_range() that the minimum allocation size can match the filesystem's sector size. This later breaks the relocation's expectation that an extent associated to a file extent item in the data relocation inode has the same size as the respective extent pointed by a file extent item in another tree - in this case the extent to which the relocation inode poins to is smaller, causing relocation.c:get_new_location() to return -EINVAL. For example, if we are relocating a data block group X that has a logical address of X and the block group has an extent allocated at the logical address X + 128KiB with a size of 64KiB: 1) At prealloc_file_extent_cluster() we allocate an extent for the data relocation inode with a size of 64KiB and associate it to the file offset 128KiB (X + 128KiB - X) of the data relocation inode. This preallocated extent was allocated at block group Z; 2) A scrub running in parallel turns block group Z into RO mode and starts scrubing its extents; 3) Relocation triggers writeback for the data relocation inode; 4) When running delalloc (btrfs_run_delalloc_range()), we try first the NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC set in its flags. However, because block group Z is in RO mode, the NOCOW path (run_delalloc_nocow()) falls back into the COW path, by calling cow_file_range(); 5) At cow_file_range(), in the first iteration of the while loop we call btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum allocation size of 4KiB (fs_info->sectorsize). Due to free space fragmentation, btrfs_reserve_extent() ends up allocating two extents of 32KiB each, each one on a different iteration of that while loop; 6) Writeback of the data relocation inode completes; 7) Relocation proceeds and ends up at relocation.c:replace_file_extents(), with a leaf which has a file extent item that points to the data extent from block group X, that has a logical address (bytenr) of X + 128KiB and a size of 64KiB. Then it calls get_new_location(), which does a lookup in the data relocation tree for a file extent item starting at offset 128KiB (X + 128KiB - X) and belonging to the data relocation inode. It finds a corresponding file extent item, however that item points to an extent that has a size of 32KiB, which doesn't match the expected size of 64KiB, resuling in -EINVAL being returned from this function and propagated up to __btrfs_cow_block(), which aborts the current transaction. To fix this make sure that at cow_file_range() when we call the allocator we pass it a minimum allocation size corresponding the desired extent size if the inode belongs to the data relocation tree, otherwise pass it the filesystem's sector size as the minimum allocation size. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 12b5d61f23bb..62c3f4972ff6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; + u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; @@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + min_alloc_size = num_bytes; + else + min_alloc_size = fs_info->sectorsize; + while (num_bytes > 0) { cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, - fs_info->sectorsize, 0, alloc_hint, + min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; -- cgit From 6bd335b469f945f75474c11e3f577f85409f39c3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Jun 2020 13:33:05 +0100 Subject: btrfs: fix bytes_may_use underflow when running balance and scrub in parallel When balance and scrub are running in parallel it is possible to end up with an underflow of the bytes_may_use counter of the data space_info object, which triggers a warning like the following: [134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data [134243.806891] ------------[ cut here ]------------ [134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs] [134243.808819] Modules linked in: btrfs blake2b_generic xor (...) [134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 [134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483) [134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs] [134243.819963] Code: 0b f2 85 (...) [134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287 [134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000 [134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810 [134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000 [134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000 [134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810 [134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000 [134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0 [134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [134243.831867] Call Trace: [134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs] [134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs] [134243.833487] cow_file_range+0x12d/0x490 [btrfs] [134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs] [134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs] [134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs] [134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs] [134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs] [134243.837450] writepage_delalloc+0xe8/0x150 [btrfs] [134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs] [134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs] [134243.839364] extent_writepages+0x44/0xa0 [btrfs] [134243.839946] do_writepages+0x23/0x80 [134243.840401] __writeback_single_inode+0x59/0x700 [134243.841006] writeback_sb_inodes+0x267/0x5f0 [134243.841548] __writeback_inodes_wb+0x87/0xe0 [134243.842091] wb_writeback+0x382/0x590 [134243.842574] ? wb_workfn+0x4a2/0x6c0 [134243.843030] wb_workfn+0x4a2/0x6c0 [134243.843468] process_one_work+0x26d/0x6a0 [134243.843978] worker_thread+0x4f/0x3e0 [134243.844452] ? process_one_work+0x6a0/0x6a0 [134243.844981] kthread+0x103/0x140 [134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70 [134243.846030] ret_from_fork+0x3a/0x50 [134243.846494] irq event stamp: 0 [134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [134243.847682] hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 [134243.848687] softirqs last enabled at (0): [] copy_process+0x74f/0x2020 [134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0 [134243.850698] ---[ end trace bd7c03622e0b0a96 ]--- [134243.851335] ------------[ cut here ]------------ When relocating a data block group, for each extent allocated in the block group we preallocate another extent with the same size for the data relocation inode (we do it at prealloc_file_extent_cluster()). We reserve space by calling btrfs_check_data_free_space(), which ends up incrementing the data space_info's bytes_may_use counter, and then call btrfs_prealloc_file_range() to allocate the extent, which always decrements the bytes_may_use counter by the same amount. The expectation is that writeback of the data relocation inode always follows a NOCOW path, by writing into the preallocated extents. However, when starting writeback we might end up falling back into the COW path, because the block group that contains the preallocated extent was turned into RO mode by a scrub running in parallel. The COW path then calls the extent allocator which ends up calling btrfs_add_reserved_bytes(), and this function decrements the bytes_may_use counter of the data space_info object by an amount corresponding to the size of the allocated extent, despite we haven't previously incremented it. When the counter currently has a value smaller then the allocated extent we reset the counter to 0 and emit a warning, otherwise we just decrement it and slowly mess up with this counter which is crucial for space reservation, the end result can be granting reserved space to tasks when there isn't really enough free space, and having the tasks fail later in critical places where error handling consists of a transaction abort or hitting a BUG_ON(). Fix this by making sure that if we fallback to the COW path for a data relocation inode, we increment the bytes_may_use counter of the data space_info object. The COW path will then decrement it at btrfs_add_reserved_bytes() on success or through its error handling part by a call to extent_clear_unlock_delalloc() (which ends up calling btrfs_clear_delalloc_extent() that does the decrement operation) in case of an error. Test case btrfs/061 from fstests could sporadically trigger this. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 62c3f4972ff6..62b49d2db928 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1378,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); + const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 range_start = start; @@ -1408,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free - * space cache inode, we must also increment bytes_may_use of the data - * space_info for the same reason. Space caches always get a prealloc + * space cache inode or an inode of the data relocation tree, we must + * also increment bytes_may_use of the data space_info for the same + * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block - * group that contains that extent to RO mode. + * group that contains that extent to RO mode and therefore force COW + * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0); - if (count > 0 || is_space_ino) { - const u64 bytes = is_space_ino ? range_bytes : count; + if (count > 0 || is_space_ino || is_reloc_ino) { + u64 bytes = count; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; + if (is_space_ino || is_reloc_ino) + bytes = range_bytes; + spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); spin_unlock(&sinfo->lock); -- cgit From e7a79811d0db136dc2d336b56d54cf1b774ce972 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 10:38:44 +0100 Subject: btrfs: check if a log root exists before locking the log_mutex on unlink This brings back an optimization that commit e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans") removed, but in a different form. So it's almost equivalent to a revert. That commit removed an optimization where we avoid locking a root's log_mutex when there is no log tree created in the current transaction. The affected code path is triggered through unlink operations. That commit was based on the assumption that the optimization was not necessary because we used to have the following checks when the patch was authored: int btrfs_del_dir_entries_in_log(...) { (...) if (dir->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); (...) } int btrfs_del_inode_ref_in_log(...) { (...) if (inode->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); (...) } However before that patch was merged, another patch was merged first which replaced those checks because they were buggy. That other patch corresponds to commit 803f0f64d17769 ("Btrfs: fix fsync not persisting dentry deletions due to inode evictions"). The assumption that if the logged_trans field of an inode had a smaller value then the current transaction's generation (transid) meant that the inode was not logged in the current transaction was only correct if the inode was not evicted and reloaded in the current transaction. So the corresponding bug fix changed those checks and replaced them with the following helper function: static bool inode_logged(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { if (inode->logged_trans == trans->transid) return true; if (inode->last_trans == trans->transid && test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; return false; } So if we have a subvolume without a log tree in the current transaction (because we had no fsyncs), every time we unlink an inode we can end up trying to lock the log_mutex of the root through join_running_log_trans() twice, once for the inode being unlinked (by btrfs_del_inode_ref_in_log()) and once for the parent directory (with btrfs_del_dir_entries_in_log()). This means if we have several unlink operations happening in parallel for inodes in the same subvolume, and the those inodes and/or their parent inode were changed in the current transaction, we end up having a lot of contention on the log_mutex. The test robots from intel reported a -30.7% performance regression for a REAIM test after commit e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans"). So just bring back the optimization to join_running_log_trans() where we check first if a log root exists before trying to lock the log_mutex. This is done by checking for a bit that is set on the root when a log tree is created and removed when a log tree is freed (at transaction commit time). Commit e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans") was merged in the 5.4 merge window while commit 803f0f64d17769 ("Btrfs: fix fsync not persisting dentry deletions due to inode evictions") was merged in the 5.3 merge window. But the first commit was actually authored before the second commit (May 23 2019 vs June 19 2019). Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/20200611090233.GL12456@shao2-debian/ Fixes: e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/tree-log.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 30ce7039bc27..d404cce8ae40 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1009,6 +1009,8 @@ enum { BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, + /* The root has a log tree. Used only for subvolume roots. */ + BTRFS_ROOT_HAS_LOG_TREE, }; /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 920cee312f4e..cd5348f352dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ret) goto out; + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; } @@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root) { int ret = -ENOENT; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) + return ret; + mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; @@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) if (root->log_root) { free_log_tree(trans, root->log_root); root->log_root = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); } return 0; } -- cgit From f2cb2f39ccc30fa13d3ac078d461031a63960e5b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 18:46:01 +0100 Subject: btrfs: fix hang on snapshot creation after RWF_NOWAIT write If we do a successful RWF_NOWAIT write we end up locking the snapshot lock of the inode, through a call to check_can_nocow(), but we never unlock it. This means the next attempt to create a snapshot on the subvolume will hang forever. Trivial reproducer: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ touch /mnt/foobar $ chattr +C /mnt/foobar $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foobar $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe 0 64K" /mnt/foobar $ btrfs subvolume snapshot -r /mnt /mnt/snap --> hangs Fix this by unlocking the snapshot lock if check_can_nocow() returned success. Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2c14312b05e8..04faa04fccd1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1914,6 +1914,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, inode_unlock(inode); return -EAGAIN; } + /* check_can_nocow() locks the snapshot lock on success */ + btrfs_drew_write_unlock(&root->snapshot_lock); } current->backing_dev_info = inode_to_bdi(inode); -- cgit From 4b1946284dd6641afdb9457101056d9e6ee6204c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 18:48:58 +0100 Subject: btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond eof If we attempt to write to prealloc extent located after eof using a RWF_NOWAIT write, we always fail with -EAGAIN. We do actually check if we have an allocated extent for the write at the start of btrfs_file_write_iter() through a call to check_can_nocow(), but later when we go into the actual direct IO write path we simply return -EAGAIN if the write starts at or beyond EOF. Trivial to reproduce: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ touch /mnt/foo $ chattr +C /mnt/foo $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foo wrote 65536/65536 bytes at offset 0 64 KiB, 16 ops; 0.0004 sec (135.575 MiB/sec and 34707.1584 ops/sec) $ xfs_io -c "falloc -k 64K 1M" /mnt/foo $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe -b 64K 64K 64K" /mnt/foo pwrite: Resource temporarily unavailable On xfs and ext4 the write succeeds, as expected. Fix this by removing the wrong check at btrfs_direct_IO(). Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 62b49d2db928..cfa863d2d97c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7889,9 +7889,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; } ret = btrfs_delalloc_reserve_space(inode, &data_reserved, offset, count); -- cgit From 260a63395f90f67d6ab89e4266af9e3dc34a77e9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 18:49:13 +0100 Subject: btrfs: fix RWF_NOWAIT write not failling when we need to cow If we attempt to do a RWF_NOWAIT write against a file range for which we can only do NOCOW for a part of it, due to the existence of holes or shared extents for example, we proceed with the write as if it were possible to NOCOW the whole range. Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ touch /mnt/sdj/bar $ chattr +C /mnt/sdj/bar $ xfs_io -d -c "pwrite -S 0xab -b 256K 0 256K" /mnt/bar wrote 262144/262144 bytes at offset 0 256 KiB, 1 ops; 0.0003 sec (694.444 MiB/sec and 2777.7778 ops/sec) $ xfs_io -c "fpunch 64K 64K" /mnt/bar $ sync $ xfs_io -d -c "pwrite -N -V 1 -b 128K -S 0xfe 0 128K" /mnt/bar wrote 131072/131072 bytes at offset 0 128 KiB, 1 ops; 0.0007 sec (160.051 MiB/sec and 1280.4097 ops/sec) This last write should fail with -EAGAIN since the file range from 64K to 128K is a hole. On xfs it fails, as expected, but on ext4 it currently succeeds because apparently it is expensive to check if there are extents allocated for the whole range, but I'll check with the ext4 people. Fix the issue by checking if check_can_nocow() returns a number of NOCOW'able bytes smaller then the requested number of bytes, and if it does return -EAGAIN. Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 04faa04fccd1..6d5d905281c6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1904,18 +1904,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, pos = iocb->ki_pos; count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { + size_t nocow_bytes = count; + /* * We will allocate space in case nodatacow is not set, * so bail */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) { inode_unlock(inode); return -EAGAIN; } /* check_can_nocow() locks the snapshot lock on success */ btrfs_drew_write_unlock(&root->snapshot_lock); + /* + * There are holes in the range or parts of the range that must + * be COWed (shared extents, RO block groups, etc), so just bail + * out. + */ + if (nocow_bytes < count) { + inode_unlock(inode); + return -EAGAIN; + } } current->backing_dev_info = inode_to_bdi(inode); -- cgit From 5dbb75ed6900048e146247b6325742d92c892548 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 18:49:39 +0100 Subject: btrfs: fix RWF_NOWAIT writes blocking on extent locks and waiting for IO A RWF_NOWAIT write is not supposed to wait on filesystem locks that can be held for a long time or for ongoing IO to complete. However when calling check_can_nocow(), if the inode has prealloc extents or has the NOCOW flag set, we can block on extent (file range) locks through the call to btrfs_lock_and_flush_ordered_range(). Such lock can take a significant amount of time to be available. For example, a fiemap task may be running, and iterating through the entire file range checking all extents and doing backref walking to determine if they are shared, or a readpage operation may be in progress. Also at btrfs_lock_and_flush_ordered_range(), called by check_can_nocow(), after locking the file range we wait for any existing ordered extent that is in progress to complete. Another operation that can take a significant amount of time and defeat the purpose of RWF_NOWAIT. So fix this by trying to lock the file range and if it's currently locked return -EAGAIN to user space. If we are able to lock the file range without waiting and there is an ordered extent in the range, return -EAGAIN as well, instead of waiting for it to complete. Finally, don't bother trying to lock the snapshot lock of the root when attempting a RWF_NOWAIT write, as that is only important for buffered writes. Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6d5d905281c6..2520605afc25 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; + num_bytes = lockend - lockstart + 1; - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); + if (nowait) { + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) + return -EAGAIN; + + ordered = btrfs_lookup_ordered_range(inode, lockstart, + num_bytes); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + goto out_unlock; + } + } else { + btrfs_lock_and_flush_ordered_range(inode, lockstart, + lockend, NULL); + } - num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_drew_write_unlock(&root->snapshot_lock); + if (!nowait) + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); } - +out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend); return ret; @@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && check_can_nocow(BTRFS_I(inode), pos, - &write_bytes) > 0) { + &write_bytes, false) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1912,12 +1928,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) { + check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, + true) <= 0) { inode_unlock(inode); return -EAGAIN; } - /* check_can_nocow() locks the snapshot lock on success */ - btrfs_drew_write_unlock(&root->snapshot_lock); /* * There are holes in the range or parts of the range that must * be COWed (shared extents, RO block groups, etc), so just bail -- cgit From b091f7fede97cc64f7aaad3eeb37965aebee3082 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 16 Jun 2020 11:31:59 -0400 Subject: btrfs: use kfree() in btrfs_ioctl_get_subvol_info() In btrfs_ioctl_get_subvol_info(), there is a classic case where kzalloc() was incorrectly paired with kzfree(). According to David Sterba, there isn't any sensitive information in the subvol_info that needs to be cleared before freeing. So kzfree() isn't really needed, use kfree() instead. Signed-off-by: Waiman Long Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 168deb8ef68a..e8f7c5f00894 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2692,7 +2692,7 @@ out: btrfs_put_root(root); out_free: btrfs_free_path(path); - kzfree(subvol_info); + kfree(subvol_info); return ret; } -- cgit From b6489a49f7b71964e37978d6f89bbdbdb263f6f5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 15 Jun 2020 17:36:58 +0100 Subject: afs: Fix silly rename Fix AFS's silly rename by the following means: (1) Set the destination directory in afs_do_silly_rename() so as to avoid misbehaviour and indicate that the directory data version will increment by 1 so as to avoid warnings about unexpected changes in the DV. Also indicate that the ctime should be updated to avoid xfstest grumbling. (2) Note when the server indicates that a directory changed more than we expected (AFS_OPERATION_DIR_CONFLICT), indicating a conflict with a third party change, checking on successful completion of unlink and rename. The problem is that the FS.RemoveFile RPC op doesn't report the status of the unlinked file, though YFS.RemoveFile2 does. This can be mitigated by the assumption that if the directory DV cranked by exactly 1, we can be sure we removed one link from the file; further, ordinarily in AFS, files cannot be hardlinked across directories, so if we reduce nlink to 0, the file is deleted. However, if the directory DV jumps by more than 1, we cannot know if a third party intervened by adding or removing a link on the file we just removed a link from. The same also goes for any vnode that is at the destination of the FS.Rename RPC op. (3) Make afs_vnode_commit_status() apply the nlink drop inside the cb_lock section along with the other attribute updates if ->op_unlinked is set on the descriptor for the appropriate vnode. (4) Issue a follow up status fetch to the unlinked file in the event of a third party conflict that makes it impossible for us to know if we actually deleted the file or not. (5) Provide a flag, AFS_VNODE_SILLY_DELETED, to make afs_getattr() lie to the user about the nlink of a silly deleted file so that it appears as 0, not 1. Found with the generic/035 and generic/084 xfstests. Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Reported-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/dir.c | 21 +++++++++++++++++++-- fs/afs/dir_silly.c | 36 +++++++++++++++++++++++++++--------- fs/afs/inode.c | 22 +++++++++++++++++----- fs/afs/internal.h | 17 +++++++++++++++++ 4 files changed, 80 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index cd74731112f4..3e3c2bf0a722 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -696,7 +696,7 @@ static const struct afs_operation_ops afs_inline_bulk_status_operation = { .success = afs_do_lookup_success, }; -static const struct afs_operation_ops afs_fetch_status_operation = { +static const struct afs_operation_ops afs_lookup_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_do_lookup_success, @@ -1496,6 +1496,7 @@ static void afs_unlink_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -1580,9 +1581,24 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) op->file[1].vnode = vnode; op->file[1].update_ctime = true; + op->file[1].op_unlinked = true; op->dentry = dentry; op->ops = &afs_unlink_operation; - return afs_do_sync_operation(op); + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + + return afs_put_operation(op); error: return afs_put_operation(op); @@ -1767,6 +1783,7 @@ static void afs_rename_success(struct afs_operation *op) _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[1]); afs_vnode_commit_status(op, &op->file[0]); if (op->file[1].vnode != op->file[0].vnode) { op->ctime = op->file[1].scb.status.mtime_client; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 001adb87ff23..04f75a44f243 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -16,6 +16,7 @@ static void afs_silly_rename_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); } @@ -69,6 +70,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return PTR_ERR(op); afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, dvnode); + op->file[0].dv_delta = 1; + op->file[1].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = old; op->dentry_2 = new; @@ -129,6 +135,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, switch (ret) { case 0: /* The rename succeeded. */ + set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags); d_move(dentry, sdentry); break; case -ERESTARTSYS: @@ -148,18 +155,11 @@ out: static void afs_silly_unlink_success(struct afs_operation *op) { - struct afs_vnode *vnode = op->file[1].vnode; - _enter("op=%08x", op->debug_id); + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); - - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - } } static void afs_silly_unlink_edit_dir(struct afs_operation *op) @@ -200,12 +200,30 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); + op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].op_unlinked = true; + op->file[1].update_ctime = true; op->dentry = dentry; op->ops = &afs_silly_unlink_operation; trace_afs_silly_rename(vnode, true); - return afs_do_sync_operation(op); + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + + return afs_put_operation(op); } /* diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d5d0ae7b2b1e..1d13d2e882ad 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -284,16 +284,25 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v write_seqlock(&vnode->cb_lock); if (vp->scb.have_error) { + /* A YFS server will return this from RemoveFile2 and AFS and + * YFS will return this from InlineBulkStatus. + */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_nlink(&vnode->vfs_inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); + op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } - } else { - if (vp->scb.have_status) - afs_apply_status(op, vp); + } else if (vp->scb.have_status) { + afs_apply_status(op, vp); if (vp->scb.have_cb) afs_apply_callback(op, vp); + } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + __afs_break_callback(vnode, afs_cb_break_for_deleted); + } } write_sequnlock(&vnode->cb_lock); @@ -304,7 +313,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v static void afs_fetch_status_success(struct afs_operation *op) { - struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_vnode *vnode = vp->vnode; int ret; @@ -318,7 +327,7 @@ static void afs_fetch_status_success(struct afs_operation *op) } } -static const struct afs_operation_ops afs_fetch_status_operation = { +const struct afs_operation_ops afs_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_fetch_status_success, @@ -729,6 +738,9 @@ int afs_getattr(const struct path *path, struct kstat *stat, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(inode, stat); + if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && + stat->nlink > 0) + stat->nlink -= 1; } while (need_seqretry(&vnode->cb_lock, seq)); done_seqretry(&vnode->cb_lock, seq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9420890e3577..573a5922c3bb 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -634,6 +634,7 @@ struct afs_vnode { #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ +#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -748,6 +749,7 @@ struct afs_vnode_param { bool need_io_lock:1; /* T if we need the I/O lock on this */ bool update_ctime:1; /* Need to update the ctime */ bool set_size:1; /* Must update i_size */ + bool op_unlinked:1; /* True if file was unlinked by op */ }; /* @@ -839,6 +841,7 @@ struct afs_operation { #define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ +#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ }; /* @@ -1066,6 +1069,8 @@ extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); /* * inode.c */ +extern const struct afs_operation_ops afs_fetch_status_operation; + extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); @@ -1497,6 +1502,18 @@ static inline void afs_update_dentry_version(struct afs_operation *op, (void *)(unsigned long)dir_vp->scb.status.data_version; } +/* + * Check for a conflicting operation on a directory that we just unlinked from. + * If someone managed to sneak a link or an unlink in on the file we just + * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). + */ +static inline void afs_check_dir_conflict(struct afs_operation *op, + struct afs_vnode_param *dvp) +{ + if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) + op->flags |= AFS_OPERATION_DIR_CONFLICT; +} + static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); -- cgit From 4e264ffd953463cd14c0720eaa9315ac052f5973 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 16 Jun 2020 19:14:08 +0900 Subject: proc/bootconfig: Fix to use correct quotes for value Fix /proc/bootconfig to select double or single quotes corrctly according to the value. If a bootconfig value includes a double quote character, we must use single-quotes to quote that value. This modifies if() condition and blocks for avoiding double-quote in value check in 2 places. Anyway, since xbc_array_for_each_value() can handle the array which has a single node correctly. Thus, if (vnode && xbc_node_is_array(vnode)) { xbc_array_for_each_value(vnode) /* vnode->next != NULL */ ... } else { snprintf(val); /* val is an empty string if !vnode */ } is equivalent to if (vnode) { xbc_array_for_each_value(vnode) /* vnode->next can be NULL */ ... } else { snprintf(""); /* value is always empty */ } Link: http://lkml.kernel.org/r/159230244786.65555.3763894451251622488.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: c1a3c36017d4 ("proc: bootconfig: Add /proc/bootconfig to show boot config list") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- fs/proc/bootconfig.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 9955d75c0585..ad31ec4ad627 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v) static int __init copy_xbc_key_value_list(char *dst, size_t size) { struct xbc_node *leaf, *vnode; - const char *val; char *key, *end = dst + size; + const char *val; + char q; int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); @@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; vnode = xbc_node_get_child(leaf); - if (vnode && xbc_node_is_array(vnode)) { + if (vnode) { xbc_array_for_each_value(vnode, val) { - ret = snprintf(dst, rest(dst, end), "\"%s\"%s", - val, vnode->next ? ", " : "\n"); + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, vnode->next ? ", " : "\n"); if (ret < 0) goto out; dst += ret; } } else { - ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val); + ret = snprintf(dst, rest(dst, end), "\"\"\n"); if (ret < 0) break; dst += ret; -- cgit From fe557319aa06c23cffc9346000f119547e0f289a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2020 09:37:53 +0200 Subject: maccess: rename probe_kernel_{read,write} to copy_{from,to}_kernel_nofault Better describe what these functions do. Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8ba492d44e68..e502414b3556 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - if (probe_kernel_read(buf, (void *) start, tsz)) { + if (copy_from_kernel_nofault(buf, (void *)start, + tsz)) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; -- cgit From 2d7d67920e5c8e0854df23ca77da2dd5880ce5dd Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:37 +0800 Subject: io_uring: don't fail links for EAGAIN error in IOPOLL mode In IOPOLL mode, for EAGAIN error, we'll try to submit io request again using io-wq, so don't fail rest of links if this io request has links. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e17df662c191..eb3797714539 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1988,7 +1988,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) + if (res != -EAGAIN && res != req->result) req_set_fail_links(req); req->result = res; if (res != -EAGAIN) -- cgit From bbde017a32b32d2fa8d5fddca25fade20132abf8 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:38 +0800 Subject: io_uring: add memory barrier to synchronize io_kiocb's result and iopoll_completed In io_complete_rw_iopoll(), stores to io_kiocb's result and iopoll completed are two independent store operations, to ensure that once iopoll_completed is ture and then req->result must been perceived by the cpu executing io_do_iopoll(), proper memory barrier should be used. And in io_do_iopoll(), we check whether req->result is EAGAIN, if it is, we'll need to issue this io request using io-wq again. In order to just issue a single smp_rmb() on the completion side, move the re-submit work to io_iopoll_complete(). Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang [axboe: don't set ->iopoll_completed for -EAGAIN retry] Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index eb3797714539..9d2ae9aa8b45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1742,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + /* * Find and free completed poll iocbs */ @@ -1750,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; + LIST_HEAD(again); + + /* order with ->result store in io_complete_rw_iopoll() */ + smp_rmb(); rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { int cflags = 0; req = list_first_entry(done, struct io_kiocb, list); + if (READ_ONCE(req->result) == -EAGAIN) { + req->iopoll_completed = 0; + list_move_tail(&req->list, &again); + continue; + } list_del(&req->list); if (req->flags & REQ_F_BUFFER_SELECTED) @@ -1773,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); -} - -static void io_iopoll_queue(struct list_head *again) -{ - struct io_kiocb *req; - do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); - refcount_inc(&req->refs); - io_queue_async_work(req); - } while (!list_empty(again)); + if (!list_empty(&again)) + io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1792,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct io_kiocb *req, *tmp; LIST_HEAD(done); - LIST_HEAD(again); bool spin; int ret; @@ -1818,13 +1829,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - if (req->result == -EAGAIN) { - list_move_tail(&req->list, &again); - continue; - } - if (!list_empty(&again)) - break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1837,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); - if (!list_empty(&again)) - io_iopoll_queue(&again); - return ret; } @@ -1990,9 +1991,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (res != -EAGAIN && res != req->result) req_set_fail_links(req); - req->result = res; - if (res != -EAGAIN) + + WRITE_ONCE(req->result, res); + /* order with io_poll_complete() checking ->result */ + if (res != -EAGAIN) { + smp_wmb(); WRITE_ONCE(req->iopoll_completed, 1); + } } /* -- cgit From 9d8426a09195e2dcf2aa249de2aaadd792d491c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jun 2020 18:42:49 -0600 Subject: io_uring: acquire 'mm' for task_work for SQPOLL If we're unlucky with timing, we could be running task_work after having dropped the memory context in the sq thread. Since dropping the context requires a runnable task state, we cannot reliably drop it as part of our check-for-work loop in io_sq_thread(). Instead, abstract out the mm acquire for the sq thread into a helper, and call it from the async task work handler. Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d2ae9aa8b45..98c83fbf4f88 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4256,6 +4256,28 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -4290,11 +4312,16 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); + if (io_sq_thread_acquire_mm(ctx, req)) { + io_cqring_add_event(req, -EFAULT); + goto end_req; + } mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); +end_req: req_set_fail_links(req); io_double_put_req(req); } @@ -5841,11 +5868,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ @@ -5954,16 +5978,6 @@ fail_req: return submitted; } -static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; -- cgit From 56952e91acc93ed624fe9da840900defb75f1323 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Jun 2020 15:00:04 -0600 Subject: io_uring: reap poll completions while waiting for refs to drop on exit If we're doing polled IO and end up having requests being submitted async, then completions can come in while we're waiting for refs to drop. We need to reap these manually, as nobody else will be looking for them. Break the wait into 1/20th of a second time waits, and check for done poll completions if we time out. Otherwise we can have done poll completions sitting in ctx->poll_list, which needs us to reap them but we're just waiting for them. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 98c83fbf4f88..2038d52c5450 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7363,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work) if (ctx->rings) io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->ref_comp); + /* + * If we're doing polled IO and end up having requests being + * submitted async (out-of-line), then completions can come in while + * we're waiting for refs to drop. We need to reap these manually, + * as nobody else will be looking for them. + */ + while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { + io_iopoll_reap_events(ctx); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + } io_ring_ctx_free(ctx); } -- cgit From 6f2cc1664db20676069cff27a461ccc97dbfd114 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 18 Jun 2020 15:01:56 +0800 Subject: io_uring: fix possible race condition against REQ_F_NEED_CLEANUP In io_read() or io_write(), when io request is submitted successfully, it'll go through the below sequence: kfree(iovec); req->flags &= ~REQ_F_NEED_CLEANUP; return ret; But clearing REQ_F_NEED_CLEANUP might be unsafe. The io request may already have been completed, and then io_complete_rw_iopoll() and io_complete_rw() will be called, both of which will also modify req->flags if needed. This causes a race condition, with concurrent non-atomic modification of req->flags. To eliminate this race, in io_read() or io_write(), if io request is submitted successfully, we don't remove REQ_F_NEED_CLEANUP flag. If REQ_F_NEED_CLEANUP is set, we'll leave __io_req_aux_free() to the iovec cleanup work correspondingly. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2038d52c5450..a78201b96179 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2670,8 +2670,8 @@ copy_iov: } } out_free: - kfree(iovec); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } @@ -2793,8 +2793,8 @@ copy_iov: } } out_free: - req->flags &= ~REQ_F_NEED_CLEANUP; - kfree(iovec); + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } -- cgit From 3373a3461aa15b7f9a871fa4cb2c9ef21ac20b47 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Thu, 18 Jun 2020 12:21:38 +0800 Subject: block: make function 'kill_bdev' static kill_bdev does not have any external user, so make it static. Signed-off-by: Zheng Bin Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- fs/block_dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 08c87db3a92b..0ae656e022fd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -void kill_bdev(struct block_device *bdev) +static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; @@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev) invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); -} -EXPORT_SYMBOL(kill_bdev); +} /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) -- cgit From f8ea5c7bceeb6ce6e7b3e7fb28c9dda8c0a58dcb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 19 Jun 2020 00:01:28 +0100 Subject: afs: Fix afs_do_lookup() to call correct fetch-status op variant Fix afs_do_lookup()'s fallback case for when FS.InlineBulkStatus isn't supported by the server. In the fallback, it calls FS.FetchStatus for the specific vnode it's meant to be looking up. Commit b6489a49f7b7 broke this by renaming one of the two identically-named afs_fetch_status_operation descriptors to something else so that one of them could be made non-static. The site that used the renamed one, however, wasn't renamed and didn't produce any warning because the other was declared in a header. Fix this by making afs_do_lookup() use the renamed variant. Note that there are two variants of the success method because one is called from ->lookup() where we may or may not have an inode, but can't call iget until after we've talked to the server - whereas the other is called from within iget where we have an inode, but it may or may not be initialised. The latter variant expects there to be an inode, but because it's being called from there former case, there might not be - resulting in an oops like the following: BUG: kernel NULL pointer dereference, address: 00000000000000b0 ... RIP: 0010:afs_fetch_status_success+0x27/0x7e ... Call Trace: afs_wait_for_operation+0xda/0x234 afs_do_lookup+0x2fe/0x3c1 afs_lookup+0x3c5/0x4bd __lookup_slow+0xcd/0x10f walk_component+0xa2/0x10c path_lookupat.isra.0+0x80/0x110 filename_lookup+0x81/0x104 vfs_statx+0x76/0x109 __do_sys_newlstat+0x39/0x6b do_syscall_64+0x4c/0x78 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: b6489a49f7b7 ("afs: Fix silly rename") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 3e3c2bf0a722..96757f3abd74 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -845,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * to FS.FetchStatus for op->file[1]. */ op->fetch_status.which = 1; - op->ops = &afs_fetch_status_operation; + op->ops = &afs_lookup_fetch_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); } -- cgit From 5481fc6eb8a7f4b76d8ad1be371d2e11b22bfb55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 19 Jun 2020 23:39:36 +0100 Subject: afs: Fix hang on rmmod due to outstanding timer The fileserver probe timer, net->fs_probe_timer, isn't cancelled when the kafs module is being removed and so the count it holds on net->servers_outstanding doesn't get dropped.. This causes rmmod to wait forever. The hung process shows a stack like: afs_purge_servers+0x1b5/0x23c [kafs] afs_net_exit+0x44/0x6e [kafs] ops_exit_list+0x72/0x93 unregister_pernet_operations+0x14c/0x1ba unregister_pernet_subsys+0x1d/0x2a afs_exit+0x29/0x6f [kafs] __do_sys_delete_module.isra.0+0x1a2/0x24b do_syscall_64+0x51/0x95 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by: (1) Attempting to cancel the probe timer and, if successful, drop the count that the timer was holding. (2) Make the timer function just drop the count and not schedule the prober if the afs portion of net namespace is being destroyed. Also, whilst we're at it, make the following changes: (3) Initialise net->servers_outstanding to 1 and decrement it before waiting on it so that it doesn't generate wake up events by being decremented to 0 until we're cleaning up. (4) Switch the atomic_dec() on ->servers_outstanding for ->fs_timer in afs_purge_servers() to use the helper function for that. Fixes: f6cbb368bcb0 ("afs: Actively poll fileservers to maintain NAT or firewall openings") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/fs_probe.c | 11 ++++++++++- fs/afs/internal.h | 1 + fs/afs/main.c | 3 +++ fs/afs/server.c | 3 ++- 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index b34f74b0f319..5d9ef517cf81 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer) { struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer); - if (!queue_work(afs_wq, &net->fs_prober)) + if (!net->live || !queue_work(afs_wq, &net->fs_prober)) afs_dec_servers_outstanding(net); } @@ -458,3 +458,12 @@ dont_wait: return -ETIME; return -EDESTADDRREQ; } + +/* + * Clean up the probing when the namespace is killed off. + */ +void afs_fs_probe_cleanup(struct afs_net *net) +{ + if (del_timer_sync(&net->fs_probe_timer)) + afs_dec_servers_outstanding(net); +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 573a5922c3bb..d520535ddb62 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1065,6 +1065,7 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); +extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c diff --git a/fs/afs/main.c b/fs/afs/main.c index 9c79c91e8005..31b472f7c734 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns) timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); + atomic_set(&net->servers_outstanding, 1); ret = -ENOMEM; sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); @@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns) error_open_socket: net->live = false; + afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); error_cell_init: @@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns) struct afs_net *net = afs_net(net_ns); net->live = false; + afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); afs_close_socket(net); diff --git a/fs/afs/server.c b/fs/afs/server.c index 039e3488511c..e82e452e2612 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net) _enter(""); if (del_timer_sync(&net->fs_timer)) - atomic_dec(&net->servers_outstanding); + afs_dec_servers_outstanding(net); afs_queue_server_manager(net); _debug("wait"); + atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); _leave(""); -- cgit From 3c597282887fd55181578996dca52ce697d985a5 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 19 Jun 2020 07:43:49 +0800 Subject: erofs: fix partially uninitialized misuse in z_erofs_onlinepage_fixup Hongyu reported "id != index" in z_erofs_onlinepage_fixup() with specific aarch64 environment easily, which wasn't shown before. After digging into that, I found that high 32 bits of page->private was set to 0xaaaaaaaa rather than 0 (due to z_erofs_onlinepage_init behavior with specific compiler options). Actually we only use low 32 bits to keep the page information since page->private is only 4 bytes on most 32-bit platforms. However z_erofs_onlinepage_fixup() uses the upper 32 bits by mistake. Let's fix it now. Reported-and-tested-by: Hongyu Jin Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Cc: # 4.19+ Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20200618234349.22553-1-hsiangkao@aol.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 7824f5563a55..9b66c28b3ae9 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page) static inline void z_erofs_onlinepage_fixup(struct page *page, uintptr_t index, bool down) { - unsigned long *p, o, v, id; -repeat: - p = &page_private(page); - o = READ_ONCE(*p); + union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; + int orig, orig_index, val; - id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (id) { +repeat: + orig = atomic_read(u.o); + orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (orig_index) { if (!index) return; - DBG_BUGON(id != index); + DBG_BUGON(orig_index != index); } - v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (cmpxchg(p, o, v) != o) + val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); + if (atomic_cmpxchg(u.o, orig, val) != orig) goto repeat; } -- cgit