From ff694ab60c29cfeba81b3d5068d3c908f22110ed Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 07:59:55 -0700
Subject: fs/ext4: Narrow scope of DAX check in setflags

When preventing DAX and journaling on an inode.  Use the effective DAX
check rather than the mount option.

This will be required to support per inode DAX flags.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20200528150003.828793-2-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfc1281fc4cb..5813e5e73eab 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -393,9 +393,9 @@ flags_err:
 	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
 		/*
 		 * Changes to the journaling mode can cause unsafe changes to
-		 * S_DAX if we are using the DAX mount option.
+		 * S_DAX if the inode is DAX
 		 */
-		if (test_opt(inode->i_sb, DAX)) {
+		if (IS_DAX(inode)) {
 			err = -EBUSY;
 			goto flags_out;
 		}
-- 
cgit 


From 6c0d077ff8200a7b8e7131ca8e25315dec243a60 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 07:59:56 -0700
Subject: fs/ext4: Disallow verity if inode is DAX

Verity and DAX are incompatible.  Changing the DAX mode due to a verity
flag change is wrong without a corresponding address_space_operations
update.

Make the 2 options mutually exclusive by returning an error if DAX was
set first.

(Setting DAX is already disabled if Verity is set first.)

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-3-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/verity.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index dc5ec724d889..f05a09fb2ae4 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp)
 	handle_t *handle;
 	int err;
 
+	if (IS_DAX(inode))
+		return -EINVAL;
+
 	if (ext4_verity_in_progress(inode))
 		return -EBUSY;
 
-- 
cgit 


From fc626fe322f13ecdfac58cbfa6cfea797ed22623 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 07:59:57 -0700
Subject: fs/ext4: Change EXT4_MOUNT_DAX to EXT4_MOUNT_DAX_ALWAYS

In prep for the new tri-state mount option which then introduces
EXT4_MOUNT_DAX_NEVER.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-4-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  4 ++--
 fs/ext4/inode.c |  2 +-
 fs/ext4/super.c | 12 ++++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 91eb4381cae5..1a3daf2d18ef 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1123,9 +1123,9 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX			0x00200	/* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS		0x00200	/* Direct Access */
 #else
-#define EXT4_MOUNT_DAX			0
+#define EXT4_MOUNT_DAX_ALWAYS		0
 #endif
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
 #define EXT4_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a4aae6acdcb..a10ff12194db 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4400,7 +4400,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 
 static bool ext4_should_use_dax(struct inode *inode)
 {
-	if (!test_opt(inode->i_sb, DAX))
+	if (!test_opt(inode->i_sb, DAX_ALWAYS))
 		return false;
 	if (!S_ISREG(inode->i_mode))
 		return false;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bf5fcb477f66..7b99c44d0a91 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1775,7 +1775,7 @@ static const struct mount_opts {
 	{Opt_min_batch_time, 0, MOPT_GTE0},
 	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
 	{Opt_init_itable, 0, MOPT_GTE0},
-	{Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET},
 	{Opt_stripe, 0, MOPT_GTE0},
 	{Opt_resuid, 0, MOPT_GTE0},
 	{Opt_resgid, 0, MOPT_GTE0},
@@ -3982,7 +3982,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "both data=journal and dioread_nolock");
 			goto failed_mount;
 		}
-		if (test_opt(sb, DAX)) {
+		if (test_opt(sb, DAX_ALWAYS)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
 				 "both data=journal and dax");
 			goto failed_mount;
@@ -4092,7 +4092,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
 		if (ext4_has_feature_inline_data(sb)) {
 			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
 					" that may contain inline data");
@@ -5412,7 +5412,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			err = -EINVAL;
 			goto restore_opts;
 		}
-		if (test_opt(sb, DAX)) {
+		if (test_opt(sb, DAX_ALWAYS)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
 				 "both data=journal and dax");
 			err = -EINVAL;
@@ -5433,10 +5433,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) {
 		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
 			"dax flag with busy inodes while remounting");
-		sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
+		sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS;
 	}
 
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
-- 
cgit 


From a8ab6d3885ef5e2300d683b79a9e1999403eefd9 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 07:59:58 -0700
Subject: fs/ext4: Update ext4_should_use_dax()

S_DAX should only be enabled when the underlying block device supports
dax.

Cache the underlying support for DAX in the super block and modify
ext4_should_use_dax() to check for device support prior to the over
riding mount option.

While we are at it change the function to ext4_should_enable_dax() as
this better reflects the ask as well as matches xfs.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-5-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/inode.c | 15 ++++++++++-----
 fs/ext4/super.c |  5 ++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1a3daf2d18ef..0b4db9ce7756 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1979,6 +1979,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
  */
 #define EXT4_FLAGS_RESIZING	0
 #define EXT4_FLAGS_SHUTDOWN	1
+#define EXT4_FLAGS_BDEV_IS_DAX	2
 
 static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
 {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a10ff12194db..6532870f6a0b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4398,10 +4398,10 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
 }
 
-static bool ext4_should_use_dax(struct inode *inode)
+static bool ext4_should_enable_dax(struct inode *inode)
 {
-	if (!test_opt(inode->i_sb, DAX_ALWAYS))
-		return false;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
 	if (!S_ISREG(inode->i_mode))
 		return false;
 	if (ext4_should_journal_data(inode))
@@ -4412,7 +4412,12 @@ static bool ext4_should_use_dax(struct inode *inode)
 		return false;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
 		return false;
-	return true;
+	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+		return false;
+	if (test_opt(inode->i_sb, DAX_ALWAYS))
+		return true;
+
+	return false;
 }
 
 void ext4_set_inode_flags(struct inode *inode)
@@ -4430,7 +4435,7 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (ext4_should_use_dax(inode))
+	if (ext4_should_enable_dax(inode))
 		new_fl |= S_DAX;
 	if (flags & EXT4_ENCRYPT_FL)
 		new_fl |= S_ENCRYPTED;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7b99c44d0a91..f7d76dcaedfe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4092,13 +4092,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	if (bdev_dax_supported(sb->s_bdev, blocksize))
+		set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+
 	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
 		if (ext4_has_feature_inline_data(sb)) {
 			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
 					" that may contain inline data");
 			goto failed_mount;
 		}
-		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+		if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
 			ext4_msg(sb, KERN_ERR,
 				"DAX unsupported by block device.");
 			goto failed_mount;
-- 
cgit 


From 043546e46dc70c25ff7e2cf6d09cbb0424fc9978 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 07:59:59 -0700
Subject: fs/ext4: Only change S_DAX on inode load

To prevent complications with in memory inodes we only set S_DAX on
inode load.  FS_XFLAG_DAX can be changed at any time and S_DAX will
change after inode eviction and reload.

Add init bool to ext4_set_inode_flags() to indicate if the inode is
being newly initialized.

Assert that S_DAX is not set on an inode which is just being loaded.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-6-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  2 +-
 fs/ext4/ialloc.c |  2 +-
 fs/ext4/inode.c  | 13 ++++++++++---
 fs/ext4/ioctl.c  |  3 ++-
 fs/ext4/super.c  |  4 ++--
 fs/ext4/verity.c |  2 +-
 6 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0b4db9ce7756..f5291693ce6e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2693,7 +2693,7 @@ extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
 extern int ext4_break_layouts(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_set_inode_flags(struct inode *, bool init);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4b8c9a9bdf0c..7941c140723f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1116,7 +1116,7 @@ got:
 	ei->i_block_group = group;
 	ei->i_last_alloc_group = ~0;
 
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, true);
 	if (IS_DIRSYNC(inode))
 		ext4_handle_sync(handle);
 	if (insert_inode_locked(inode) < 0) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6532870f6a0b..01636cf5f322 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4420,11 +4420,13 @@ static bool ext4_should_enable_dax(struct inode *inode)
 	return false;
 }
 
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
 {
 	unsigned int flags = EXT4_I(inode)->i_flags;
 	unsigned int new_fl = 0;
 
+	WARN_ON_ONCE(IS_DAX(inode) && init);
+
 	if (flags & EXT4_SYNC_FL)
 		new_fl |= S_SYNC;
 	if (flags & EXT4_APPEND_FL)
@@ -4435,8 +4437,13 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (ext4_should_enable_dax(inode))
+
+	/* Because of the way inode_set_flags() works we must preserve S_DAX
+	 * here if already set. */
+	new_fl |= (inode->i_flags & S_DAX);
+	if (init && ext4_should_enable_dax(inode))
 		new_fl |= S_DAX;
+
 	if (flags & EXT4_ENCRYPT_FL)
 		new_fl |= S_ENCRYPTED;
 	if (flags & EXT4_CASEFOLD_FL)
@@ -4650,7 +4657,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 		 * not initialized on a new filesystem. */
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, true);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (ext4_has_feature_64bit(sb))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5813e5e73eab..145083e8cd1e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -381,7 +381,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
 			ext4_clear_inode_flag(inode, i);
 	}
 
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, false);
+
 	inode->i_ctime = current_time(inode);
 
 	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7d76dcaedfe..80eb814c47eb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1348,7 +1348,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
 			 * S_DAX may be disabled
 			 */
-			ext4_set_inode_flags(inode);
+			ext4_set_inode_flags(inode, false);
 		}
 		return res;
 	}
@@ -1375,7 +1375,7 @@ retry:
 		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
 		 * S_DAX may be disabled
 		 */
-		ext4_set_inode_flags(inode);
+		ext4_set_inode_flags(inode, false);
 		res = ext4_mark_inode_dirty(handle, inode);
 		if (res)
 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index f05a09fb2ae4..89a155ece323 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -244,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
 		if (err)
 			goto out_stop;
 		ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
-		ext4_set_inode_flags(inode);
+		ext4_set_inode_flags(inode, false);
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	}
 out_stop:
-- 
cgit 


From 9cb20f94afcd2964944f9468e38da736ee855b19 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 08:00:00 -0700
Subject: fs/ext4: Make DAX mount option a tri-state

We add 'always', 'never', and 'inode' (default).  '-o dax' continues to
operate the same which is equivalent to 'always'.  This new
functionality is limited to ext4 only.

Specifically we introduce a 2nd DAX mount flag EXT4_MOUNT2_DAX_NEVER and set
it and EXT4_MOUNT_DAX_ALWAYS appropriately for the mode.

We also force EXT4_MOUNT2_DAX_NEVER if !CONFIG_FS_DAX.

Finally, EXT4_MOUNT2_DAX_INODE is used solely to detect if the user
specified that option for printing.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-7-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  2 ++
 fs/ext4/inode.c |  2 ++
 fs/ext4/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f5291693ce6e..65ffb831b2b9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1168,6 +1168,8 @@ struct ext4_inode_info {
 						      blocks */
 #define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
 						      file systems */
+#define EXT4_MOUNT2_DAX_NEVER		0x00000008 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE		0x00000010 /* For printing options only */
 
 #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
 						specified journal checksum */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 01636cf5f322..68fac9289109 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4402,6 +4402,8 @@ static bool ext4_should_enable_dax(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	if (test_opt2(inode->i_sb, DAX_NEVER))
+		return false;
 	if (!S_ISREG(inode->i_mode))
 		return false;
 	if (ext4_should_journal_data(inode))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80eb814c47eb..5e056aa20ce9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1512,7 +1512,8 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
-	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
 	Opt_nowarn_on_error, Opt_mblk_io_submit,
 	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
@@ -1579,6 +1580,9 @@ static const match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_i_version, "i_version"},
 	{Opt_dax, "dax"},
+	{Opt_dax_always, "dax=always"},
+	{Opt_dax_inode, "dax=inode"},
+	{Opt_dax_never, "dax=never"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_warn_on_error, "warn_on_error"},
@@ -1726,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_NO_EXT3	0x0200
 #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 #define MOPT_STRING	0x0400
+#define MOPT_SKIP	0x0800
 
 static const struct mount_opts {
 	int	token;
@@ -1775,7 +1780,13 @@ static const struct mount_opts {
 	{Opt_min_batch_time, 0, MOPT_GTE0},
 	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
 	{Opt_init_itable, 0, MOPT_GTE0},
-	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET},
+	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
+	{Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
+		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+	{Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
+		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+	{Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
+		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
 	{Opt_stripe, 0, MOPT_GTE0},
 	{Opt_resuid, 0, MOPT_GTE0},
 	{Opt_resgid, 0, MOPT_GTE0},
@@ -2084,13 +2095,32 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		}
 		sbi->s_jquota_fmt = m->mount_opt;
 #endif
-	} else if (token == Opt_dax) {
+	} else if (token == Opt_dax || token == Opt_dax_always ||
+		   token == Opt_dax_inode || token == Opt_dax_never) {
 #ifdef CONFIG_FS_DAX
-		ext4_msg(sb, KERN_WARNING,
-		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-		sbi->s_mount_opt |= m->mount_opt;
+		switch (token) {
+		case Opt_dax:
+		case Opt_dax_always:
+			ext4_msg(sb, KERN_WARNING,
+				"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
+			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+			break;
+		case Opt_dax_never:
+			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+			break;
+		case Opt_dax_inode:
+			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+			/* Strictly for printing options */
+			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+			break;
+		}
 #else
 		ext4_msg(sb, KERN_INFO, "dax option not supported");
+		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
 		return -1;
 #endif
 	} else if (token == Opt_data_err_abort) {
@@ -2254,7 +2284,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
 		int want_set = m->flags & MOPT_SET;
 		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
-		    (m->flags & MOPT_CLEAR_ERR))
+		    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
 			continue;
 		if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
 			continue; /* skip if same as the default */
@@ -2314,6 +2344,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	if (DUMMY_ENCRYPTION_ENABLED(sbi))
 		SEQ_OPTS_PUTS("test_dummy_encryption");
 
+	if (test_opt(sb, DAX_ALWAYS)) {
+		if (IS_EXT2_SB(sb))
+			SEQ_OPTS_PUTS("dax");
+		else
+			SEQ_OPTS_PUTS("dax=always");
+	} else if (test_opt2(sb, DAX_NEVER)) {
+		SEQ_OPTS_PUTS("dax=never");
+	} else if (test_opt2(sb, DAX_INODE)) {
+		SEQ_OPTS_PUTS("dax=inode");
+	}
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -5436,10 +5477,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS) {
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS ||
+	    (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER ||
+	    (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) {
 		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
-			"dax flag with busy inodes while remounting");
-		sbi->s_mount_opt ^= EXT4_MOUNT_DAX_ALWAYS;
+			"dax mount option with busy inodes while remounting");
+		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+		sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS;
+		sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE);
+		sbi->s_mount_opt2 |= old_opts.s_mount_opt2 &
+				     (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE);
 	}
 
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
-- 
cgit 


From fcebc7949cd2ff97407e5b77ed99a7211674c6de Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 08:00:01 -0700
Subject: fs/ext4: Remove jflag variable

The jflag variable serves almost no purpose.  Remove it.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-8-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 145083e8cd1e..779631e8e849 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -300,7 +300,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	int err = -EPERM, migrate = 0;
 	struct ext4_iloc iloc;
 	unsigned int oldflags, mask, i;
-	unsigned int jflag;
 	struct super_block *sb = inode->i_sb;
 
 	/* Is it quota file? Do not allow user to mess with it */
@@ -309,9 +308,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
 
 	oldflags = ei->i_flags;
 
-	/* The JOURNAL_DATA flag is modifiable only by root */
-	jflag = flags & EXT4_JOURNAL_DATA_FL;
-
 	err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
 	if (err)
 		goto flags_out;
@@ -320,7 +316,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	 * The JOURNAL_DATA flag can only be changed by
 	 * the relevant capability.
 	 */
-	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+	if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
 		if (!capable(CAP_SYS_RESOURCE))
 			goto flags_out;
 	}
@@ -391,7 +387,7 @@ flags_err:
 	if (err)
 		goto flags_out;
 
-	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+	if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
 		/*
 		 * Changes to the journaling mode can cause unsafe changes to
 		 * S_DAX if the inode is DAX
@@ -401,7 +397,8 @@ flags_err:
 			goto flags_out;
 		}
 
-		err = ext4_change_inode_journal_flag(inode, jflag);
+		err = ext4_change_inode_journal_flag(inode,
+						     flags & EXT4_JOURNAL_DATA_FL);
 		if (err)
 			goto flags_out;
 	}
-- 
cgit 


From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Thu, 28 May 2020 08:00:02 -0700
Subject: fs/ext4: Introduce DAX inode flag

Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4
inode.

Set the flag to be user visible and changeable.  Set the flag to be
inherited.  Allow applications to change the flag at any time except if
it conflicts with the set of mutually exclusive flags (Currently VERITY,
ENCRYPT, JOURNAL_DATA).

Furthermore, restrict setting any of the exclusive flags if DAX is set.

While conceptually possible, we do not allow setting EXT4_DAX_FL while
at the same time clearing exclusion flags (or vice versa) for 2 reasons:

	1) The DAX flag does not take effect immediately which
	   introduces quite a bit of complexity
	2) There is no clear use case for being this flexible

Finally, on regular files, flag the inode to not be cached to facilitate
changing S_DAX on the next creation of the inode.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 18 ++++++++++++++----
 fs/ext4/inode.c  |  2 +-
 fs/ext4/ioctl.c  | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/super.c  |  3 +++
 fs/ext4/verity.c |  2 +-
 5 files changed, 65 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 65ffb831b2b9..598e00a9453f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -415,13 +415,16 @@ struct flex_groups {
 #define EXT4_VERITY_FL			0x00100000 /* Verity protected inode */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL			0x02000000 /* Inode is DAX */
+
 #define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
 #define EXT4_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
 #define EXT4_CASEFOLD_FL		0x40000000 /* Casefolded file */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x604BC0FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x725BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x624BC0FF /* User modifiable flags */
 
 /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
 #define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
@@ -429,14 +432,16 @@ struct flex_groups {
 					 EXT4_APPEND_FL | \
 					 EXT4_NODUMP_FL | \
 					 EXT4_NOATIME_FL | \
-					 EXT4_PROJINHERIT_FL)
+					 EXT4_PROJINHERIT_FL | \
+					 EXT4_DAX_FL)
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
 			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
 			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
 			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
-			   EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+			   EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+			   EXT4_DAX_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
 #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -448,6 +453,10 @@ struct flex_groups {
 /* The only flags that should be swapped */
 #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
 
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+			   EXT4_JOURNAL_DATA_FL)
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 {
@@ -488,6 +497,7 @@ enum {
 	EXT4_INODE_VERITY	= 20,	/* Verity protected inode */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 /* 22 was formerly EXT4_INODE_EOFBLOCKS */
+	EXT4_INODE_DAX		= 25,	/* Inode is DAX */
 	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
 	EXT4_INODE_PROJINHERIT	= 29,	/* Create with parents projid */
 	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 68fac9289109..778b0dbe3da6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4419,7 +4419,7 @@ static bool ext4_should_enable_dax(struct inode *inode)
 	if (test_opt(inode->i_sb, DAX_ALWAYS))
 		return true;
 
-	return false;
+	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
 }
 
 void ext4_set_inode_flags(struct inode *inode, bool init)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 779631e8e849..1b520d07d371 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
 	return 0;
 }
 
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (S_ISDIR(inode->i_mode))
+		return;
+
+	if (test_opt2(inode->i_sb, DAX_NEVER) ||
+	    test_opt(inode->i_sb, DAX_ALWAYS))
+		return;
+
+	if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+		d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+			   unsigned int flags)
+{
+	if (flags & EXT4_DAX_FL) {
+		if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+		     ext4_test_inode_state(inode,
+					  EXT4_STATE_VERITY_IN_PROGRESS)) {
+			return false;
+		}
+	}
+
+	if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+			return false;
+
+	return true;
+}
+
 static int ext4_ioctl_setflags(struct inode *inode,
 			       unsigned int flags)
 {
@@ -320,6 +352,12 @@ static int ext4_ioctl_setflags(struct inode *inode,
 		if (!capable(CAP_SYS_RESOURCE))
 			goto flags_out;
 	}
+
+	if (!dax_compatible(inode, oldflags, flags)) {
+		err = -EOPNOTSUPP;
+		goto flags_out;
+	}
+
 	if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
 		migrate = 1;
 
@@ -365,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	if (err)
 		goto flags_err;
 
+	ext4_dax_dontcache(inode, flags);
+
 	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
 		if (!(mask & EXT4_FL_USER_MODIFIABLE))
 			continue;
@@ -525,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
 		xflags |= FS_XFLAG_NOATIME;
 	if (iflags & EXT4_PROJINHERIT_FL)
 		xflags |= FS_XFLAG_PROJINHERIT;
+	if (iflags & EXT4_DAX_FL)
+		xflags |= FS_XFLAG_DAX;
 	return xflags;
 }
 
 #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
 				  FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
-				  FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+				  FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
+				  FS_XFLAG_DAX)
 
 /* Transfer xflags flags to internal */
 static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
@@ -549,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 		iflags |= EXT4_NOATIME_FL;
 	if (xflags & FS_XFLAG_PROJINHERIT)
 		iflags |= EXT4_PROJINHERIT_FL;
+	if (xflags & FS_XFLAG_DAX)
+		iflags |= EXT4_DAX_FL;
 
 	return iflags;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5e056aa20ce9..3658e3016999 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1323,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
 		return -EINVAL;
 
+	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+		return -EOPNOTSUPP;
+
 	res = ext4_convert_inline_data(inode);
 	if (res)
 		return res;
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 89a155ece323..4fecb3e4e338 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,7 +113,7 @@ static int ext4_begin_enable_verity(struct file *filp)
 	handle_t *handle;
 	int err;
 
-	if (IS_DAX(inode))
+	if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
 		return -EINVAL;
 
 	if (ext4_verity_in_progress(inode))
-- 
cgit 


From 829b37b8cddb1db75c1b7905505b90e593b15db1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 10 Jun 2020 11:16:37 -0400
Subject: ext4: avoid race conditions when remounting with options that change
 dax

Trying to change dax mount options when remounting could allow mount
options to be enabled for a small amount of time, and then the mount
option change would be reverted.

In the case of "mount -o remount,dax", this can cause a race where
files would temporarily treated as DAX --- and then not.

Cc: stable@kernel.org
Reported-by: syzbot+bca9799bf129256190da@syzkaller.appspotmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a22d67c5bc00..edf06c1bee9d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2104,16 +2104,40 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		switch (token) {
 		case Opt_dax:
 		case Opt_dax_always:
+			if (is_remount &&
+			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+			fail_dax_change_remount:
+				ext4_msg(sb, KERN_ERR, "can't change "
+					 "dax mount option while remounting");
+				return -1;
+			}
+			if (is_remount &&
+			    (test_opt(sb, DATA_FLAGS) ==
+			     EXT4_MOUNT_JOURNAL_DATA)) {
+				    ext4_msg(sb, KERN_ERR, "can't mount with "
+					     "both data=journal and dax");
+				    return -1;
+			}
 			ext4_msg(sb, KERN_WARNING,
 				"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
 			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
 			break;
 		case Opt_dax_never:
+			if (is_remount &&
+			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
+				goto fail_dax_change_remount;
 			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
 			break;
 		case Opt_dax_inode:
+			if (is_remount &&
+			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
+				goto fail_dax_change_remount;
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
 			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
 			/* Strictly for printing options */
@@ -5454,12 +5478,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			err = -EINVAL;
 			goto restore_opts;
 		}
-		if (test_opt(sb, DAX_ALWAYS)) {
-			ext4_msg(sb, KERN_ERR, "can't mount with "
-				 "both data=journal and dax");
-			err = -EINVAL;
-			goto restore_opts;
-		}
 	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
 		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5475,18 +5493,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX_ALWAYS ||
-	    (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_NEVER ||
-	    (sbi->s_mount_opt2 ^ old_opts.s_mount_opt2) & EXT4_MOUNT2_DAX_INODE) {
-		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
-			"dax mount option with busy inodes while remounting");
-		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
-		sbi->s_mount_opt |= old_opts.s_mount_opt & EXT4_MOUNT_DAX_ALWAYS;
-		sbi->s_mount_opt2 &= ~(EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE);
-		sbi->s_mount_opt2 |= old_opts.s_mount_opt2 &
-				     (EXT4_MOUNT2_DAX_NEVER | EXT4_MOUNT2_DAX_INODE);
-	}
-
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
 		ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
 
-- 
cgit 


From cfb3c85a600c6aa25a2581b3c1c4db3460f14e46 Mon Sep 17 00:00:00 2001
From: Jeffle Xu <jefflexu@linux.alibaba.com>
Date: Fri, 22 May 2020 12:18:44 +0800
Subject: ext4: fix partial cluster initialization when splitting extent

Fix the bug when calculating the physical block number of the first
block in the split extent.

This bug will cause xfstests shared/298 failure on ext4 with bigalloc
enabled occasionally. Ext4 error messages indicate that previously freed
blocks are being freed again, and the following fsck will fail due to
the inconsistency of block bitmap and bg descriptor.

The following is an example case:

1. First, Initialize a ext4 filesystem with cluster size '16K', block size
'4K', in which case, one cluster contains four blocks.

2. Create one file (e.g., xxx.img) on this ext4 filesystem. Now the extent
tree of this file is like:

...
36864:[0]4:220160
36868:[0]14332:145408
51200:[0]2:231424
...

3. Then execute PUNCH_HOLE fallocate on this file. The hole range is
like:

..
ext4_ext_remove_space: dev 254,16 ino 12 since 49506 end 49506 depth 1
ext4_ext_remove_space: dev 254,16 ino 12 since 49544 end 49546 depth 1
ext4_ext_remove_space: dev 254,16 ino 12 since 49605 end 49607 depth 1
...

4. Then the extent tree of this file after punching is like

...
49507:[0]37:158047
49547:[0]58:158087
...

5. Detailed procedure of punching hole [49544, 49546]

5.1. The block address space:
```
lblk        ~49505  49506   49507~49543     49544~49546    49547~
	  ---------+------+-------------+----------------+--------
	    extent | hole |   extent	|	hole	 | extent
	  ---------+------+-------------+----------------+--------
pblk       ~158045  158046  158047~158083  158084~158086   158087~
```

5.2. The detailed layout of cluster 39521:
```
		cluster 39521
	<------------------------------->

		hole		  extent
	<----------------------><--------

lblk      49544   49545   49546   49547
	+-------+-------+-------+-------+
	|	|	|	|	|
	+-------+-------+-------+-------+
pblk     158084  1580845  158086  158087
```

5.3. The ftrace output when punching hole [49544, 49546]:
- ext4_ext_remove_space (start 49544, end 49546)
  - ext4_ext_rm_leaf (start 49544, end 49546, last_extent [49507(158047), 40], partial [pclu 39522 lblk 0 state 2])
    - ext4_remove_blocks (extent [49507(158047), 40], from 49544 to 49546, partial [pclu 39522 lblk 0 state 2]
      - ext4_free_blocks: (block 158084 count 4)
        - ext4_mballoc_free (extent 1/6753/1)

5.4. Ext4 error message in dmesg:
EXT4-fs error (device vdb): mb_free_blocks:1457: group 1, block 158084:freeing already freed block (bit 6753); block bitmap corrupt.
EXT4-fs error (device vdb): ext4_mb_generate_buddy:747: group 1, block bitmap and bg descriptor inconsistent: 19550 vs 19551 free clusters

In this case, the whole cluster 39521 is freed mistakenly when freeing
pblock 158084~158086 (i.e., the first three blocks of this cluster),
although pblock 158087 (the last remaining block of this cluster) has
not been freed yet.

The root cause of this isuue is that, the pclu of the partial cluster is
calculated mistakenly in ext4_ext_remove_space(). The correct
partial_cluster.pclu (i.e., the cluster number of the first block in the
next extent, that is, lblock 49597 (pblock 158086)) should be 39521 rather
than 39522.

Fixes: f4226d9ea400 ("ext4: fix partial cluster initialization")
Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Eric Whitney <enwlinux@gmail.com>
Cc: stable@kernel.org # v3.19+
Link: https://lore.kernel.org/r/1590121124-37096-1-git-send-email-jefflexu@linux.alibaba.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d088ff1e902..221f240eae60 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2844,7 +2844,7 @@ again:
 			 * in use to avoid freeing it when removing blocks.
 			 */
 			if (sbi->s_cluster_ratio > 1) {
-				pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+				pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
 				partial.pclu = EXT4_B2C(sbi, pblk);
 				partial.state = nofree;
 			}
-- 
cgit 


From 5adaccac46ea79008d7b75f47913f1a00f91d0ce Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Mon, 1 Jun 2020 15:34:04 +0800
Subject: ext4: stop overwrite the errcode in ext4_setup_super

Now the errcode from ext4_commit_super will overwrite EROFS exists in
ext4_setup_super. Actually, no need to call ext4_commit_super since we
will return EROFS. Fix it by goto done directly.

Fixes: c89128a00838 ("ext4: handle errors on ext4_commit_super")
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20200601073404.3712492-1-yangerkun@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index edf06c1bee9d..127b8efa8d54 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2412,6 +2412,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		ext4_msg(sb, KERN_ERR, "revision level too high, "
 			 "forcing read-only mode");
 		err = -EROFS;
+		goto done;
 	}
 	if (read_only)
 		goto done;
-- 
cgit 


From 2ce3ee931a097e9720310db3f09c01c825a4580c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 1 Jun 2020 13:05:43 -0700
Subject: ext4: avoid utf8_strncasecmp() with unstable name

If the dentry name passed to ->d_compare() fits in dentry::d_iname, then
it may be concurrently modified by a rename.  This can cause undefined
behavior (possibly out-of-bounds memory accesses or crashes) in
utf8_strncasecmp(), since fs/unicode/ isn't written to handle strings
that may be concurrently modified.

Fix this by first copying the filename to a stack buffer if needed.
This way we get a stable snapshot of the filename.

Fixes: b886ee3e778e ("ext4: Support case-insensitive file name lookups")
Cc: <stable@vger.kernel.org> # v5.2+
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Daniel Rosenberg <drosen@google.com>
Cc: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Link: https://lore.kernel.org/r/20200601200543.59417-1-ebiggers@kernel.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/dir.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c654205f648d..1d82336b1cd4 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
 	struct qstr qstr = {.name = str, .len = len };
 	const struct dentry *parent = READ_ONCE(dentry->d_parent);
 	const struct inode *inode = READ_ONCE(parent->d_inode);
+	char strbuf[DNAME_INLINE_LEN];
 
 	if (!inode || !IS_CASEFOLDED(inode) ||
 	    !EXT4_SB(inode->i_sb)->s_encoding) {
@@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
 		return memcmp(str, name->name, len);
 	}
 
+	/*
+	 * If the dentry name is stored in-line, then it may be concurrently
+	 * modified by a rename.  If this happens, the VFS will eventually retry
+	 * the lookup, so it doesn't matter what ->d_compare() returns.
+	 * However, it's unsafe to call utf8_strncasecmp() with an unstable
+	 * string.  Therefore, we have to copy the name into a temporary buffer.
+	 */
+	if (len <= DNAME_INLINE_LEN - 1) {
+		memcpy(strbuf, str, len);
+		strbuf[len] = 0;
+		qstr.name = strbuf;
+		/* prevent compiler from optimizing out the temporary buffer */
+		barrier();
+	}
+
 	return ext4_ci_compare(inode, name, &qstr, false);
 }
 
-- 
cgit 


From 811985365378df01386c3cfb7ff716e74ca376d5 Mon Sep 17 00:00:00 2001
From: Ritesh Harjani <riteshh@linux.ibm.com>
Date: Tue, 9 Jun 2020 16:23:10 +0530
Subject: ext4: mballoc: Use this_cpu_read instead of this_cpu_ptr

Simplify reading a seq variable by directly using this_cpu_read API
instead of doing this_cpu_ptr and then dereferencing it.

This also avoid the below kernel BUG: which happens when
CONFIG_DEBUG_PREEMPT is enabled

BUG: using smp_processor_id() in preemptible [00000000] code: syz-fuzzer/6927
caller is ext4_mb_new_blocks+0xa4d/0x3b70 fs/ext4/mballoc.c:4711
CPU: 1 PID: 6927 Comm: syz-fuzzer Not tainted 5.7.0-next-20200602-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x18f/0x20d lib/dump_stack.c:118
 check_preemption_disabled+0x20d/0x220 lib/smp_processor_id.c:48
 ext4_mb_new_blocks+0xa4d/0x3b70 fs/ext4/mballoc.c:4711
 ext4_ext_map_blocks+0x201b/0x33e0 fs/ext4/extents.c:4244
 ext4_map_blocks+0x4cb/0x1640 fs/ext4/inode.c:626
 ext4_getblk+0xad/0x520 fs/ext4/inode.c:833
 ext4_bread+0x7c/0x380 fs/ext4/inode.c:883
 ext4_append+0x153/0x360 fs/ext4/namei.c:67
 ext4_init_new_dir fs/ext4/namei.c:2757 [inline]
 ext4_mkdir+0x5e0/0xdf0 fs/ext4/namei.c:2802
 vfs_mkdir+0x419/0x690 fs/namei.c:3632
 do_mkdirat+0x21e/0x280 fs/namei.c:3655
 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:359
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 42f56b7a4a7d ("ext4: mballoc: introduce pcpu seqcnt for freeing PA
to improve ENOSPC handling")
Suggested-by: Borislav Petkov <bp@alien8.de>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reported-by: syzbot+82f324bb69744c5f6969@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/534f275016296996f54ecf65168bb3392b6f653d.1591699601.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9083113a8c0..c0a331e2feb0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4708,7 +4708,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 
 	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
-	seq = *this_cpu_ptr(&discard_pa_seq);
+	seq = this_cpu_read(discard_pa_seq);
 	if (!ext4_mb_use_preallocated(ac)) {
 		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
 		ext4_mb_normalize_request(ac, ar);
-- 
cgit 


From 88ee9d571b6d8ed345f877e05f685814412e359b Mon Sep 17 00:00:00 2001
From: "Jan (janneke) Nieuwenhuizen" <janneke@gnu.org>
Date: Mon, 25 May 2020 21:39:40 +0200
Subject: ext4: support xattr gnu.* namespace for the Hurd

The Hurd gained[0] support for moving the translator and author
fields out of the inode and into the "gnu.*" xattr namespace.

In anticipation of that, an xattr INDEX was reserved[1].  The Hurd has
now been brought into compliance[2] with that.

This patch adds support for reading and writing such attributes from
Linux; you can now do something like

    mkdir -p hurd-root/servers/socket
    touch hurd-root/servers/socket/1
    setfattr --name=gnu.translator --value='"/hurd/pflocal\0"' \
        hurd-root/servers/socket/1
    getfattr --name=gnu.translator hurd-root/servers/socket/1
    # file: 1
    gnu.translator="/hurd/pflocal"

to setup a pipe translator, which is being used to create[3] a
vm-image for the Hurd from GNU Guix.

[0] https://summerofcode.withgoogle.com/projects/#5869799859027968
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3980bd3b406addb327d858aebd19e229ea340b9a
[2] https://git.savannah.gnu.org/cgit/hurd/hurd.git/commit/?id=a04c7bf83172faa7cb080fbe3b6c04a8415ca645
[3] https://git.savannah.gnu.org/cgit/guix.git/log/?h=wip-hurd-vm

Signed-off-by: Jan Nieuwenhuizen <janneke@gnu.org>
Link: https://lore.kernel.org/r/20200525193940.878-1-janneke@gnu.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/Makefile     |  3 ++-
 fs/ext4/xattr.c      |  2 ++
 fs/ext4/xattr.h      |  1 +
 fs/ext4/xattr_hurd.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 fs/ext4/xattr_hurd.c

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 4ccb3c9189d8..2e42f47a7f98 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,7 +9,8 @@ ext4-y	:= balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
 		extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
 		indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
 		mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
-		super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+		super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+		xattr_user.o
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b29a40738ac..7d2f6576d954 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 #ifdef CONFIG_EXT4_FS_SECURITY
 	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
 #endif
+	[EXT4_XATTR_INDEX_HURD]		     = &ext4_xattr_hurd_handler,
 };
 
 const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 #ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
 #endif
+	&ext4_xattr_hurd_handler,
 	NULL
 };
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ffe21ac77f78..730b91fa0dd7 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -124,6 +124,7 @@ struct ext4_xattr_inode_array {
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
 
 #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
 
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 000000000000..8cfa74a56361
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+	return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, void *buffer, size_t size)
+{
+	if (!test_opt(inode->i_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+			      name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
+{
+	if (!test_opt(inode->i_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+			      name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+	.prefix	= XATTR_HURD_PREFIX,
+	.list	= ext4_xattr_hurd_list,
+	.get	= ext4_xattr_hurd_get,
+	.set	= ext4_xattr_hurd_set,
+};
-- 
cgit 


From 7b97d868b7ab2448859668de9222b8af43f76e78 Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Tue, 9 Jun 2020 15:35:40 +0800
Subject: ext4, jbd2: ensure panic by fix a race between jbd2 abort and ext4
 error handlers

In the ext4 filesystem with errors=panic, if one process is recording
errno in the superblock when invoking jbd2_journal_abort() due to some
error cases, it could be raced by another __ext4_abort() which is
setting the SB_RDONLY flag but missing panic because errno has not been
recorded.

jbd2_journal_commit_transaction()
 jbd2_journal_abort()
  journal->j_flags |= JBD2_ABORT;
  jbd2_journal_update_sb_errno()
                                    | ext4_journal_check_start()
                                    |  __ext4_abort()
                                    |   sb->s_flags |= SB_RDONLY;
                                    |   if (!JBD2_REC_ERR)
                                    |        return;
  journal->j_flags |= JBD2_REC_ERR;

Finally, it will no longer trigger panic because the filesystem has
already been set read-only. Fix this by introduce j_abort_mutex to make
sure journal abort is completed before panic, and remove JBD2_REC_ERR
flag.

Fixes: 4327ba52afd03 ("ext4, jbd2: ensure entering into panic after recording an error in superblock")
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20200609073540.3810702-1-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c   | 16 +++++-----------
 fs/jbd2/journal.c | 17 ++++++++++++-----
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 127b8efa8d54..2660a7e47eef 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb)
 		smp_wmb();
 		sb->s_flags |= SB_RDONLY;
 	} else if (test_opt(sb, ERRORS_PANIC)) {
-		if (EXT4_SB(sb)->s_journal &&
-		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
-			return;
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
 	}
@@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function,
 	va_end(args);
 
 	if (sb_rdonly(sb) == 0) {
-		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		if (EXT4_SB(sb)->s_journal)
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		/*
 		 * Make sure updated value of ->s_mount_flags will be visible
 		 * before ->s_flags update
 		 */
 		smp_wmb();
 		sb->s_flags |= SB_RDONLY;
-		if (EXT4_SB(sb)->s_journal)
-			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 	}
-	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
-		if (EXT4_SB(sb)->s_journal &&
-		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
-			return;
+	if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
 		panic("EXT4-fs panic from previous error\n");
-	}
 }
 
 void __ext4_msg(struct super_block *sb,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a49d0e670ddf..e4944436e733 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1140,6 +1140,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
 	init_waitqueue_head(&journal->j_wait_reserved);
+	mutex_init(&journal->j_abort_mutex);
 	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
@@ -1402,7 +1403,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 		printk(KERN_ERR "JBD2: Error %d detected when updating "
 		       "journal superblock for %s.\n", ret,
 		       journal->j_devname);
-		jbd2_journal_abort(journal, ret);
+		if (!is_journal_aborted(journal))
+			jbd2_journal_abort(journal, ret);
 	}
 
 	return ret;
@@ -2153,6 +2155,13 @@ void jbd2_journal_abort(journal_t *journal, int errno)
 {
 	transaction_t *transaction;
 
+	/*
+	 * Lock the aborting procedure until everything is done, this avoid
+	 * races between filesystem's error handling flow (e.g. ext4_abort()),
+	 * ensure panic after the error info is written into journal's
+	 * superblock.
+	 */
+	mutex_lock(&journal->j_abort_mutex);
 	/*
 	 * ESHUTDOWN always takes precedence because a file system check
 	 * caused by any other journal abort error is not required after
@@ -2167,6 +2176,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
 			journal->j_errno = errno;
 			jbd2_journal_update_sb_errno(journal);
 		}
+		mutex_unlock(&journal->j_abort_mutex);
 		return;
 	}
 
@@ -2188,10 +2198,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
 	 * layer could realise that a filesystem check is needed.
 	 */
 	jbd2_journal_update_sb_errno(journal);
-
-	write_lock(&journal->j_state_lock);
-	journal->j_flags |= JBD2_REC_ERR;
-	write_unlock(&journal->j_state_lock);
+	mutex_unlock(&journal->j_abort_mutex);
 }
 
 /**
-- 
cgit 


From 59960b9deb5354e4cdb0b6ed3a3b653a2b4eb602 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 16:36:30 +0300
Subject: io_uring: fix lazy work init

Don't leave garbage in req.work before punting async on -EAGAIN
in io_iopoll_queue().

[  140.922099] general protection fault, probably for non-canonical
     address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
...
[  140.922105] RIP: 0010:io_worker_handle_work+0x1db/0x480
...
[  140.922114] Call Trace:
[  140.922118]  ? __next_timer_interrupt+0xe0/0xe0
[  140.922119]  io_wqe_worker+0x2a9/0x360
[  140.922121]  ? _raw_spin_unlock_irqrestore+0x24/0x40
[  140.922124]  kthread+0x12c/0x170
[  140.922125]  ? io_worker_handle_work+0x480/0x480
[  140.922126]  ? kthread_park+0x90/0x90
[  140.922127]  ret_from_fork+0x22/0x30

Fixes: 7cdaf587de7c ("io_uring: avoid whole io_wq_work copy for requests completed inline")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..c04b20bf2803 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1087,6 +1087,7 @@ static inline void io_prep_async_work(struct io_kiocb *req,
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
 
+	io_req_init_async(req);
 	io_req_work_grab_env(req, def);
 
 	*link = io_prep_linked_timeout(req);
-- 
cgit 


From bb413489288e4e457353bac513fddb6330d245ca Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 12 Jun 2020 00:15:13 +0100
Subject: afs: Fix non-setting of mtime when writing into mmap

The mtime on an inode needs to be updated when a write is made into an
mmap'ed section.  There are three ways in which this could be done: update
it when page_mkwrite is called, update it when a page is changed from dirty
to writeback or leave it to the server and fix the mtime up from the reply
to the StoreData RPC.

Found with the generic/215 xfstest.

Fixes: 1cf7a1518aef ("afs: Implement shared-writeable mmap")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/write.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 768497f82aee..9270bb01be67 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -844,6 +844,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 			     vmf->page->index, priv);
 	SetPagePrivate(vmf->page);
 	set_page_private(vmf->page, priv);
+	file_update_time(file);
 
 	sb_end_pagefault(inode->i_sb);
 	return VM_FAULT_LOCKED;
-- 
cgit 


From 1f32ef79897052ef7d3d154610d8d6af95abde83 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 12 Jun 2020 23:58:51 +0100
Subject: afs: afs_write_end() should change i_size under the right lock

Fix afs_write_end() to change i_size under vnode->cb_lock rather than
->wb_lock so that it doesn't race with afs_vnode_commit_status() and
afs_getattr().

The ->wb_lock is only meant to guard access to ->wb_keys which isn't
accessed by that piece of code.

Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/write.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9270bb01be67..a55cb73e0449 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -194,11 +194,11 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (maybe_i_size > i_size) {
-		spin_lock(&vnode->wb_lock);
+		write_seqlock(&vnode->cb_lock);
 		i_size = i_size_read(&vnode->vfs_inode);
 		if (maybe_i_size > i_size)
 			i_size_write(&vnode->vfs_inode, maybe_i_size);
-		spin_unlock(&vnode->wb_lock);
+		write_sequnlock(&vnode->cb_lock);
 	}
 
 	if (!PageUptodate(page)) {
-- 
cgit 


From 3f4aa981816368fe6b1d13c2bfbe76df9687e787 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 13 Jun 2020 00:03:48 +0100
Subject: afs: Fix EOF corruption

When doing a partial writeback, afs_write_back_from_locked_page() may
generate an FS.StoreData RPC request that writes out part of a file when a
file has been constructed from pieces by doing seek, write, seek, write,
... as is done by ld.

The FS.StoreData RPC is given the current i_size as the file length, but
the server basically ignores it unless the data length is 0 (in which case
it's just a truncate operation).  The revised file length returned in the
result of the RPC may then not reflect what we suggested - and this leads
to i_size getting moved backwards - which causes issues later.

Fix the client to take account of this by ignoring the returned file size
unless the data version number jumped unexpectedly - in which case we're
going to have to clear the pagecache and reload anyway.

This can be observed when doing a kernel build on an AFS mount.  The
following pair of commands produce the issue:

  ld -m elf_x86_64 -z max-page-size=0x200000 --emit-relocs \
      -T arch/x86/realmode/rm/realmode.lds \
      arch/x86/realmode/rm/header.o \
      arch/x86/realmode/rm/trampoline_64.o \
      arch/x86/realmode/rm/stack.o \
      arch/x86/realmode/rm/reboot.o \
      -o arch/x86/realmode/rm/realmode.elf
  arch/x86/tools/relocs --realmode \
      arch/x86/realmode/rm/realmode.elf \
      >arch/x86/realmode/rm/realmode.relocs

This results in the latter giving:

	Cannot read ELF section headers 0/18: Success

as the realmode.elf file got corrupted.

The sequence of events can also be driven with:

	xfs_io -t -f \
		-c "pwrite -S 0x58 0 0x58" \
		-c "pwrite -S 0x59 10000 1000" \
		-c "close" \
		/afs/example.com/scratch/a

Fixes: 31143d5d515e ("AFS: implement basic file write support")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/inode.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index cd0a0060950b..8d10bfb392d1 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -168,6 +168,7 @@ static void afs_apply_status(struct afs_operation *op,
 	struct timespec64 t;
 	umode_t mode;
 	bool data_changed = false;
+	bool change_size = false;
 
 	_enter("{%llx:%llu.%u} %s",
 	       vp->fid.vid, vp->fid.vnode, vp->fid.unique,
@@ -226,6 +227,7 @@ static void afs_apply_status(struct afs_operation *op,
 		} else {
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
+		change_size = true;
 	} else if (vnode->status.type == AFS_FTYPE_DIR) {
 		/* Expected directory change is handled elsewhere so
 		 * that we can locally edit the directory and save on a
@@ -233,11 +235,19 @@ static void afs_apply_status(struct afs_operation *op,
 		 */
 		if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
 			data_changed = false;
+		change_size = true;
 	}
 
 	if (data_changed) {
 		inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
-		afs_set_i_size(vnode, status->size);
+
+		/* Only update the size if the data version jumped.  If the
+		 * file is being modified locally, then we might have our own
+		 * idea of what the size should be that's not the same as
+		 * what's on the server.
+		 */
+		if (change_size)
+			afs_set_i_size(vnode, status->size);
 	}
 }
 
-- 
cgit 


From da8d07551275abb3a38fae2d16e02bc9cc7396b2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 13 Jun 2020 19:34:59 +0100
Subject: afs: Concoct ctimes

The in-kernel afs filesystem ignores ctime because the AFS fileserver
protocol doesn't support ctimes.  This, however, causes various xfstests to
fail.

Work around this by:

 (1) Setting ctime to attr->ia_ctime in afs_setattr().

 (2) Not ignoring ATTR_MTIME_SET, ATTR_TIMES_SET and ATTR_TOUCH settings.

 (3) Setting the ctime from the server mtime when on the target file when
     creating a hard link to it.

 (4) Setting the ctime on directories from their revised mtimes when
     renaming/moving a file.

Found by the generic/221 and generic/309 xfstests.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c      | 18 +++++++++++++++++-
 fs/afs/inode.c    | 29 ++++++++++++++++++-----------
 fs/afs/internal.h |  2 ++
 fs/afs/write.c    |  1 +
 4 files changed, 38 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index aa1d34141ea3..308a125e9de3 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1268,6 +1268,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
 static void afs_create_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
+	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1325,6 +1326,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
 	op->dentry	= dentry;
 	op->create.mode	= S_IFDIR | mode;
 	op->create.reason = afs_edit_dir_for_mkdir;
@@ -1350,6 +1352,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
 static void afs_rmdir_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
+	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1404,6 +1407,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
 
 	op->dentry	= dentry;
 	op->ops		= &afs_rmdir_operation;
@@ -1479,6 +1483,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
 static void afs_unlink_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
+	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[1]);
@@ -1537,6 +1542,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
 
 	/* Try to make sure we have a callback promise on the victim. */
 	ret = afs_validate(vnode, op->key);
@@ -1561,6 +1567,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 
 	op->file[1].vnode = vnode;
+	op->file[1].update_ctime = true;
 	op->dentry	= dentry;
 	op->ops		= &afs_unlink_operation;
 	return afs_do_sync_operation(op);
@@ -1601,6 +1608,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	afs_op_set_vnode(op, 0, dvnode);
 	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
 
 	op->dentry	= dentry;
 	op->create.mode	= S_IFREG | mode;
@@ -1620,6 +1628,7 @@ static void afs_link_success(struct afs_operation *op)
 	struct afs_vnode_param *vp = &op->file[1];
 
 	_enter("op=%08x", op->debug_id);
+	op->ctime = dvp->scb.status.mtime_client;
 	afs_vnode_commit_status(op, dvp);
 	afs_vnode_commit_status(op, vp);
 	afs_update_dentry_version(op, dvp, op->dentry);
@@ -1672,6 +1681,8 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	afs_op_set_vnode(op, 0, dvnode);
 	afs_op_set_vnode(op, 1, vnode);
 	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
+	op->file[1].update_ctime = true;
 
 	op->dentry		= dentry;
 	op->dentry_2		= from;
@@ -1740,9 +1751,12 @@ static void afs_rename_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 
+	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_vnode_commit_status(op, &op->file[0]);
-	if (op->file[1].vnode != op->file[0].vnode)
+	if (op->file[1].vnode != op->file[0].vnode) {
+		op->ctime = op->file[1].scb.status.mtime_client;
 		afs_vnode_commit_status(op, &op->file[1]);
+	}
 }
 
 static void afs_rename_edit_dir(struct afs_operation *op)
@@ -1860,6 +1874,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
 	op->file[0].dv_delta = 1;
 	op->file[1].dv_delta = 1;
+	op->file[0].update_ctime = true;
+	op->file[1].update_ctime = true;
 
 	op->dentry		= old_dentry;
 	op->dentry_2		= new_dentry;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 8d10bfb392d1..e99705474dd1 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -165,6 +165,7 @@ static void afs_apply_status(struct afs_operation *op,
 {
 	struct afs_file_status *status = &vp->scb.status;
 	struct afs_vnode *vnode = vp->vnode;
+	struct inode *inode = &vnode->vfs_inode;
 	struct timespec64 t;
 	umode_t mode;
 	bool data_changed = false;
@@ -187,25 +188,25 @@ static void afs_apply_status(struct afs_operation *op,
 	}
 
 	if (status->nlink != vnode->status.nlink)
-		set_nlink(&vnode->vfs_inode, status->nlink);
+		set_nlink(inode, status->nlink);
 
 	if (status->owner != vnode->status.owner)
-		vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+		inode->i_uid = make_kuid(&init_user_ns, status->owner);
 
 	if (status->group != vnode->status.group)
-		vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+		inode->i_gid = make_kgid(&init_user_ns, status->group);
 
 	if (status->mode != vnode->status.mode) {
-		mode = vnode->vfs_inode.i_mode;
+		mode = inode->i_mode;
 		mode &= ~S_IALLUGO;
 		mode |= status->mode;
-		WRITE_ONCE(vnode->vfs_inode.i_mode, mode);
+		WRITE_ONCE(inode->i_mode, mode);
 	}
 
 	t = status->mtime_client;
-	vnode->vfs_inode.i_ctime = t;
-	vnode->vfs_inode.i_mtime = t;
-	vnode->vfs_inode.i_atime = t;
+	inode->i_mtime = t;
+	if (vp->update_ctime)
+		inode->i_ctime = op->ctime;
 
 	if (vnode->status.data_version != status->data_version)
 		data_changed = true;
@@ -239,15 +240,18 @@ static void afs_apply_status(struct afs_operation *op,
 	}
 
 	if (data_changed) {
-		inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+		inode_set_iversion_raw(inode, status->data_version);
 
 		/* Only update the size if the data version jumped.  If the
 		 * file is being modified locally, then we might have our own
 		 * idea of what the size should be that's not the same as
 		 * what's on the server.
 		 */
-		if (change_size)
+		if (change_size) {
 			afs_set_i_size(vnode, status->size);
+			inode->i_ctime = t;
+			inode->i_atime = t;
+		}
 	}
 }
 
@@ -817,7 +821,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 	       attr->ia_valid);
 
 	if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
-				ATTR_MTIME))) {
+				ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET |
+				ATTR_TOUCH))) {
 		_leave(" = 0 [unsupported]");
 		return 0;
 	}
@@ -837,6 +842,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_SIZE)
 		op->file[0].dv_delta = 1;
+	op->ctime = attr->ia_ctime;
+	op->file[0].update_ctime = 1;
 
 	op->ops = &afs_setattr_operation;
 	return afs_do_sync_operation(op);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0c9806ef2a19..92cd6b8cc01f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -746,6 +746,7 @@ struct afs_vnode_param {
 	u8			dv_delta;	/* Expected change in data version */
 	bool			put_vnode;	/* T if we have a ref on the vnode */
 	bool			need_io_lock;	/* T if we need the I/O lock on this */
+	bool			update_ctime;	/* Need to update the ctime */
 };
 
 /*
@@ -766,6 +767,7 @@ struct afs_operation {
 	struct dentry		*dentry;	/* Dentry to be altered */
 	struct dentry		*dentry_2;	/* Second dentry to be altered */
 	struct timespec64	mtime;		/* Modification time to record */
+	struct timespec64	ctime;		/* Change time to set */
 	short			nr_files;	/* Number of entries in file[], more_files */
 	short			error;
 	unsigned int		abort_code;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a55cb73e0449..2003d7ee9e43 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -393,6 +393,7 @@ static void afs_store_data_success(struct afs_operation *op)
 {
 	struct afs_vnode *vnode = op->file[0].vnode;
 
+	op->ctime = op->file[0].scb.status.mtime_client;
 	afs_vnode_commit_status(op, &op->file[0]);
 	if (op->error == 0) {
 		afs_pages_written_back(vnode, op->store.first, op->store.last);
-- 
cgit 


From 793fe82ee33aab1023cf023cd7d744af19a3dff9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 12 Jun 2020 16:13:52 +0100
Subject: afs: Fix truncation issues and mmap writeback size

Fix the following issues:

 (1) Fix writeback to reduce the size of a store operation to i_size,
     effectively discarding the extra data.

     The problem comes when afs_page_mkwrite() records that a page is about
     to be modified by mmap().  It doesn't know what bits of the page are
     going to be modified, so it records the whole page as being dirty
     (this is stored in page->private as start and end offsets).

     Without this, the marshalling for the store to the server extends the
     size of the file to the end of the page (in afs_fs_store_data() and
     yfs_fs_store_data()).

 (2) Fix setattr to actually truncate the pagecache, thereby clearing
     the discarded part of a file.

 (3) Fix setattr to check that the new size is okay and to disable
     ATTR_SIZE if i_size wouldn't change.

 (4) Force i_size to be updated as the result of a truncate.

 (5) Don't truncate if ATTR_SIZE is not set.

 (6) Call pagecache_isize_extended() if the file was enlarged.

Note that truncate_set_size() isn't used because the setting of i_size is
done inside afs_vnode_commit_status() under the vnode->cb_lock.

Found with the generic/029 and generic/393 xfstests.

Fixes: 31143d5d515e ("AFS: implement basic file write support")
Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/inode.c    | 27 +++++++++++++++++++++++++--
 fs/afs/internal.h |  7 ++++---
 fs/afs/write.c    |  6 ++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e99705474dd1..70c925978d10 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -169,7 +169,7 @@ static void afs_apply_status(struct afs_operation *op,
 	struct timespec64 t;
 	umode_t mode;
 	bool data_changed = false;
-	bool change_size = false;
+	bool change_size = vp->set_size;
 
 	_enter("{%llx:%llu.%u} %s",
 	       vp->fid.vid, vp->fid.vnode, vp->fid.unique,
@@ -799,7 +799,15 @@ void afs_evict_inode(struct inode *inode)
 
 static void afs_setattr_success(struct afs_operation *op)
 {
+	struct inode *inode = &op->file[0].vnode->vfs_inode;
+
 	afs_vnode_commit_status(op, &op->file[0]);
+	if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
+		if (size > i_size)
+			pagecache_isize_extended(inode, i_size, size);
+		truncate_pagecache(inode, size);
+	}
 }
 
 static const struct afs_operation_ops afs_setattr_operation = {
@@ -815,6 +823,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct afs_operation *op;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+	int ret;
 
 	_enter("{%llx:%llu},{n=%pd},%x",
 	       vnode->fid.vid, vnode->fid.vnode, dentry,
@@ -827,6 +836,18 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 		return 0;
 	}
 
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (!S_ISREG(vnode->vfs_inode.i_mode))
+			return -EISDIR;
+
+		ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size);
+		if (ret)
+			return ret;
+
+		if (attr->ia_size == i_size_read(&vnode->vfs_inode))
+			attr->ia_valid &= ~ATTR_SIZE;
+	}
+
 	/* flush any dirty data outstanding on a regular file */
 	if (S_ISREG(vnode->vfs_inode.i_mode))
 		filemap_write_and_wait(vnode->vfs_inode.i_mapping);
@@ -840,8 +861,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 	afs_op_set_vnode(op, 0, vnode);
 	op->setattr.attr = attr;
 
-	if (attr->ia_valid & ATTR_SIZE)
+	if (attr->ia_valid & ATTR_SIZE) {
 		op->file[0].dv_delta = 1;
+		op->file[0].set_size = true;
+	}
 	op->ctime = attr->ia_ctime;
 	op->file[0].update_ctime = 1;
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 92cd6b8cc01f..bdc1e5efebd4 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -744,9 +744,10 @@ struct afs_vnode_param {
 	afs_dataversion_t	dv_before;	/* Data version before the call */
 	unsigned int		cb_break_before; /* cb_break + cb_s_break before the call */
 	u8			dv_delta;	/* Expected change in data version */
-	bool			put_vnode;	/* T if we have a ref on the vnode */
-	bool			need_io_lock;	/* T if we need the I/O lock on this */
-	bool			update_ctime;	/* Need to update the ctime */
+	bool			put_vnode:1;	/* T if we have a ref on the vnode */
+	bool			need_io_lock:1;	/* T if we need the I/O lock on this */
+	bool			update_ctime:1;	/* Need to update the ctime */
+	bool			set_size:1;	/* Must update i_size */
 };
 
 /*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2003d7ee9e43..7437806332d9 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -492,6 +492,7 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
 	unsigned long count, priv;
 	unsigned n, offset, to, f, t;
 	pgoff_t start, first, last;
+	loff_t i_size, end;
 	int loop, ret;
 
 	_enter(",%lx", primary_page->index);
@@ -592,7 +593,12 @@ no_more:
 	first = primary_page->index;
 	last = first + count - 1;
 
+	end = (loff_t)last * PAGE_SIZE + to;
+	i_size = i_size_read(&vnode->vfs_inode);
+
 	_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+	if (end > i_size)
+		to = i_size & ~PAGE_MASK;
 
 	ret = afs_store_data(mapping, first, last, offset, to);
 	switch (ret) {
-- 
cgit 


From 4ec89596d06bd481ba827f3b409b938d63914157 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 14 Jun 2020 22:12:05 +0100
Subject: afs: Fix the mapping of the UAEOVERFLOW abort code

Abort code UAEOVERFLOW is returned when we try and set a time that's out of
range, but it's currently mapped to EREMOTEIO by the default case.

Fix UAEOVERFLOW to map instead to EOVERFLOW.

Found with the generic/258 xfstest.  Note that the test is wrong as it
assumes that the filesystem will support a pre-UNIX-epoch date.

Fixes: 1eda8bab70ca ("afs: Add support for the UAE error table")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/misc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 52b19e9c1535..5334f1bd2bca 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -83,6 +83,7 @@ int afs_abort_to_error(u32 abort_code)
 	case UAENOLCK:			return -ENOLCK;
 	case UAENOTEMPTY:		return -ENOTEMPTY;
 	case UAELOOP:			return -ELOOP;
+	case UAEOVERFLOW:		return -EOVERFLOW;
 	case UAENOMEDIUM:		return -ENOMEDIUM;
 	case UAEDQUOT:			return -EDQUOT;
 
-- 
cgit 


From f4c2665e33f48904f2766d644df33fb3fd54b5ec Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:24:02 +0300
Subject: io-wq: reorder cancellation pending -> running

Go all over all pending lists and cancel works there, and only then
try to match running requests. No functional changes here, just a
preparation for bulk cancellation.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 54 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0b65a912b036..03c7e37548c2 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -927,19 +927,14 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	return ret;
 }
 
-static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
-					    struct io_cb_cancel_data *match)
+static bool io_wqe_cancel_pending_work(struct io_wqe *wqe,
+				       struct io_cb_cancel_data *match)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 	unsigned long flags;
 	bool found = false;
 
-	/*
-	 * First check pending list, if we're lucky we can just remove it
-	 * from there. CANCEL_OK means that the work is returned as-new,
-	 * no completion will be posted for it.
-	 */
 	spin_lock_irqsave(&wqe->lock, flags);
 	wq_list_for_each(node, prev, &wqe->work_list) {
 		work = container_of(node, struct io_wq_work, list);
@@ -952,21 +947,20 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 	}
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
-	if (found) {
+	if (found)
 		io_run_cancel(work, wqe);
-		return IO_WQ_CANCEL_OK;
-	}
+	return found;
+}
+
+static bool io_wqe_cancel_running_work(struct io_wqe *wqe,
+				       struct io_cb_cancel_data *match)
+{
+	bool found;
 
-	/*
-	 * Now check if a free (going busy) or busy worker has the work
-	 * currently running. If we find it there, we'll return CANCEL_RUNNING
-	 * as an indication that we attempt to signal cancellation. The
-	 * completion will run normally in this case.
-	 */
 	rcu_read_lock();
 	found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
 	rcu_read_unlock();
-	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+	return found;
 }
 
 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
@@ -976,18 +970,34 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 		.fn	= cancel,
 		.data	= data,
 	};
-	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
 	int node;
 
+	/*
+	 * First check pending list, if we're lucky we can just remove it
+	 * from there. CANCEL_OK means that the work is returned as-new,
+	 * no completion will be posted for it.
+	 */
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		ret = io_wqe_cancel_work(wqe, &match);
-		if (ret != IO_WQ_CANCEL_NOTFOUND)
-			break;
+		if (io_wqe_cancel_pending_work(wqe, &match))
+			return IO_WQ_CANCEL_OK;
 	}
 
-	return ret;
+	/*
+	 * Now check if a free (going busy) or busy worker has the work
+	 * currently running. If we find it there, we'll return CANCEL_RUNNING
+	 * as an indication that we attempt to signal cancellation. The
+	 * completion will run normally in this case.
+	 */
+	for_each_node(node) {
+		struct io_wqe *wqe = wq->wqes[node];
+
+		if (io_wqe_cancel_running_work(wqe, &match))
+			return IO_WQ_CANCEL_RUNNING;
+	}
+
+	return IO_WQ_CANCEL_NOTFOUND;
 }
 
 static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-- 
cgit 


From 4f26bda1522c35d2701fc219368c7101c17005c1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:24:03 +0300
Subject: io-wq: add an option to cancel all matched reqs

This adds support for cancelling all io-wq works matching a predicate.
It isn't used yet, so no change in observable behaviour.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 60 +++++++++++++++++++++++++++++++++--------------------------
 fs/io-wq.h    |  2 +-
 fs/io_uring.c |  2 +-
 3 files changed, 36 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 03c7e37548c2..e290202d6f64 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq)
 struct io_cb_cancel_data {
 	work_cancel_fn *fn;
 	void *data;
+	int nr_running;
+	int nr_pending;
+	bool cancel_all;
 };
 
 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 {
 	struct io_cb_cancel_data *match = data;
 	unsigned long flags;
-	bool ret = false;
 
 	/*
 	 * Hold the lock to avoid ->cur_work going out of scope, caller
@@ -920,55 +922,55 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
 	    match->fn(worker->cur_work, match->data)) {
 		send_sig(SIGINT, worker->task, 1);
-		ret = true;
+		match->nr_running++;
 	}
 	spin_unlock_irqrestore(&worker->lock, flags);
 
-	return ret;
+	return match->nr_running && !match->cancel_all;
 }
 
-static bool io_wqe_cancel_pending_work(struct io_wqe *wqe,
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 				       struct io_cb_cancel_data *match)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 	unsigned long flags;
-	bool found = false;
 
+retry:
 	spin_lock_irqsave(&wqe->lock, flags);
 	wq_list_for_each(node, prev, &wqe->work_list) {
 		work = container_of(node, struct io_wq_work, list);
+		if (!match->fn(work, match->data))
+			continue;
 
-		if (match->fn(work, match->data)) {
-			wq_list_del(&wqe->work_list, node, prev);
-			found = true;
-			break;
-		}
+		wq_list_del(&wqe->work_list, node, prev);
+		spin_unlock_irqrestore(&wqe->lock, flags);
+		io_run_cancel(work, wqe);
+		match->nr_pending++;
+		if (!match->cancel_all)
+			return;
+
+		/* not safe to continue after unlock */
+		goto retry;
 	}
 	spin_unlock_irqrestore(&wqe->lock, flags);
-
-	if (found)
-		io_run_cancel(work, wqe);
-	return found;
 }
 
-static bool io_wqe_cancel_running_work(struct io_wqe *wqe,
+static void io_wqe_cancel_running_work(struct io_wqe *wqe,
 				       struct io_cb_cancel_data *match)
 {
-	bool found;
-
 	rcu_read_lock();
-	found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
+	io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
 	rcu_read_unlock();
-	return found;
 }
 
 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
-				  void *data)
+				  void *data, bool cancel_all)
 {
 	struct io_cb_cancel_data match = {
-		.fn	= cancel,
-		.data	= data,
+		.fn		= cancel,
+		.data		= data,
+		.cancel_all	= cancel_all,
 	};
 	int node;
 
@@ -980,7 +982,8 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (io_wqe_cancel_pending_work(wqe, &match))
+		io_wqe_cancel_pending_work(wqe, &match);
+		if (match.nr_pending && !match.cancel_all)
 			return IO_WQ_CANCEL_OK;
 	}
 
@@ -993,10 +996,15 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (io_wqe_cancel_running_work(wqe, &match))
+		io_wqe_cancel_running_work(wqe, &match);
+		if (match.nr_running && !match.cancel_all)
 			return IO_WQ_CANCEL_RUNNING;
 	}
 
+	if (match.nr_running)
+		return IO_WQ_CANCEL_RUNNING;
+	if (match.nr_pending)
+		return IO_WQ_CANCEL_OK;
 	return IO_WQ_CANCEL_NOTFOUND;
 }
 
@@ -1007,7 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
 
 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 {
-	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
+	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
 }
 
 static bool io_wq_pid_match(struct io_wq_work *work, void *data)
@@ -1021,7 +1029,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
 {
 	void *data = (void *) (unsigned long) pid;
 
-	return io_wq_cancel_cb(wq, io_wq_pid_match, data);
+	return io_wq_cancel_cb(wq, io_wq_pid_match, data, false);
 }
 
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 8e138fa88b9f..7d5bd431c5e3 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -130,7 +130,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
 
 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
-					void *data);
+					void *data, bool cancel_all);
 
 struct task_struct *io_wq_get_task(struct io_wq *wq);
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c04b20bf2803..94bd88556c1e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4773,7 +4773,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
 	enum io_wq_cancel cancel_ret;
 	int ret = 0;
 
-	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
 	switch (cancel_ret) {
 	case IO_WQ_CANCEL_OK:
 		ret = 0;
-- 
cgit 


From 44e728b8aae0bb6d4229129083974f9dea43f50b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:24:04 +0300
Subject: io_uring: cancel all task's requests on exit

If a process is going away, io_uring_flush() will cancel only 1
request with a matching pid. Cancel all of them

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 14 --------------
 fs/io-wq.h    |  1 -
 fs/io_uring.c | 14 ++++++++++++--
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index e290202d6f64..47c5f3aeb460 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1018,20 +1018,6 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
 }
 
-static bool io_wq_pid_match(struct io_wq_work *work, void *data)
-{
-	pid_t pid = (pid_t) (unsigned long) data;
-
-	return work->task_pid == pid;
-}
-
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
-{
-	void *data = (void *) (unsigned long) pid;
-
-	return io_wq_cancel_cb(wq, io_wq_pid_match, data, false);
-}
-
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
 	int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 7d5bd431c5e3..b72538fe5afd 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -125,7 +125,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
 
 void io_wq_cancel_all(struct io_wq *wq);
 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
 
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 94bd88556c1e..ad5128a40c14 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7424,6 +7424,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
+static bool io_cancel_pid_cb(struct io_wq_work *work, void *data)
+{
+	pid_t pid = (pid_t) (unsigned long) data;
+
+	return work->task_pid == pid;
+}
+
 static int io_uring_flush(struct file *file, void *data)
 {
 	struct io_ring_ctx *ctx = file->private_data;
@@ -7433,8 +7440,11 @@ static int io_uring_flush(struct file *file, void *data)
 	/*
 	 * If the task is going away, cancel work it may have pending
 	 */
-	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
-		io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+		void *data = (void *) (unsigned long)task_pid_vnr(current);
+
+		io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true);
+	}
 
 	return 0;
 }
-- 
cgit 


From 67c4d9e693e3bb7fb968af24e3584f821a78ba56 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:24:05 +0300
Subject: io_uring: batch cancel in io_uring_cancel_files()

Instead of waiting for each request one by one, first try to cancel all
of them in a batched manner, and then go over inflight_list/etc to reap
leftovers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad5128a40c14..b6bcd5a7f4bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7366,9 +7366,22 @@ static int io_uring_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+	struct files_struct *files = data;
+
+	return work->files == files;
+}
+
 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				  struct files_struct *files)
 {
+	if (list_empty_careful(&ctx->inflight_list))
+		return;
+
+	/* cancel all at once, should be faster than doing it one by one*/
+	io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
+
 	while (!list_empty_careful(&ctx->inflight_list)) {
 		struct io_kiocb *cancel_req = NULL, *req;
 		DEFINE_WAIT(wait);
-- 
cgit 


From 4dd2824d6d5914949b5fe589538bc2622d84c5dd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:33:13 +0300
Subject: io_uring: lazy get task

There will be multiple places where req->task is used, so refcount-pin
it lazily with introduced *io_{get,put}_req_task(). We need to always
have valid ->task for cancellation reasons, but don't care about pinning
it in some cases. That's why it sets req->task in io_req_init() and
implements get/put laziness with a flag.

This also removes using @current from polling io_arm_poll_handler(),
etc., but doesn't change observable behaviour.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b6bcd5a7f4bc..5f946eb8b740 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
 	REQ_F_NO_FILE_TABLE_BIT,
 	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
+	REQ_F_TASK_PINNED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -598,6 +599,8 @@ enum {
 	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
+	/* req->task is refcounted */
+	REQ_F_TASK_PINNED	= BIT(REQ_F_TASK_PINNED_BIT),
 };
 
 struct async_poll {
@@ -910,6 +913,21 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
+static void io_get_req_task(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_TASK_PINNED)
+		return;
+	get_task_struct(req->task);
+	req->flags |= REQ_F_TASK_PINNED;
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_TASK_PINNED)
+		put_task_struct(req->task);
+}
+
 static void io_file_put_work(struct work_struct *work);
 
 /*
@@ -1399,9 +1417,7 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-	if (req->task)
-		put_task_struct(req->task);
-
+	__io_put_req_task(req);
 	io_req_work_drop_env(req);
 }
 
@@ -4367,8 +4383,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		memcpy(&apoll->work, &req->work, sizeof(req->work));
 	had_io = req->io != NULL;
 
-	get_task_struct(current);
-	req->task = current;
+	io_get_req_task(req);
 	req->apoll = apoll;
 	INIT_HLIST_NODE(&req->hash_node);
 
@@ -4556,8 +4571,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-	get_task_struct(current);
-	req->task = current;
+	io_get_req_task(req);
 	return 0;
 }
 
@@ -5818,7 +5832,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
-	req->task = NULL;
+	req->task = current;
 	req->result = 0;
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
-- 
cgit 


From 801dd57bd1d8c2c253f43635a3045bfa32a810b3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 15 Jun 2020 10:33:14 +0300
Subject: io_uring: cancel by ->task not pid

For an exiting process it tries to cancel all its inflight requests. Use
req->task to match such instead of work.pid. We always have req->task
set, and it will be valid because we're matching only current exiting
task.

Also, remove work.pid and everything related, it's useless now.

Reported-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h    |  1 -
 fs/io_uring.c | 16 ++++++----------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.h b/fs/io-wq.h
index b72538fe5afd..071f1a997800 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -90,7 +90,6 @@ struct io_wq_work {
 	const struct cred *creds;
 	struct fs_struct *fs;
 	unsigned flags;
-	pid_t task_pid;
 };
 
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5f946eb8b740..e17df662c191 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1063,8 +1063,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 		}
 		spin_unlock(&current->fs->lock);
 	}
-	if (!req->work.task_pid)
-		req->work.task_pid = task_pid_vnr(current);
 }
 
 static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -7451,11 +7449,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static bool io_cancel_pid_cb(struct io_wq_work *work, void *data)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 {
-	pid_t pid = (pid_t) (unsigned long) data;
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct task_struct *task = data;
 
-	return work->task_pid == pid;
+	return req->task == task;
 }
 
 static int io_uring_flush(struct file *file, void *data)
@@ -7467,11 +7466,8 @@ static int io_uring_flush(struct file *file, void *data)
 	/*
 	 * If the task is going away, cancel work it may have pending
 	 */
-	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
-		void *data = (void *) (unsigned long)task_pid_vnr(current);
-
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true);
-	}
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
 
 	return 0;
 }
-- 
cgit 


From 241cb28e38c322068c87da4c479e07fbae0c1a4e Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 28 May 2020 09:35:11 -0500
Subject: aio: Replace zero-length array with flexible-array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having a
dynamically sized set of trailing elements in a structure. Kernel code should
always use “flexible array members”[1] for these cases. The older style of
one-element or zero-length arrays should no longer be used[2].

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://github.com/KSPP/linux/issues/21

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 7ecddc2f38db..91e7cc4a9f17 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -67,7 +67,7 @@ struct aio_ring {
 	unsigned	header_length;	/* size of aio_ring */
 
 
-	struct io_event		io_events[0];
+	struct io_event		io_events[];
 }; /* 128 bytes + ring size */
 
 /*
-- 
cgit 


From 6112bad79f4de4bde62f630494760a300724e4b4 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 28 May 2020 09:35:11 -0500
Subject: jffs2: Replace zero-length array with flexible-array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having a
dynamically sized set of trailing elements in a structure. Kernel code should
always use “flexible array members”[1] for these cases. The older style of
one-element or zero-length arrays should no longer be used[2].

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://github.com/KSPP/linux/issues/21

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 fs/jffs2/nodelist.h | 2 +-
 fs/jffs2/summary.h  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 0637271f3770..8ff4d1a1e774 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -259,7 +259,7 @@ struct jffs2_full_dirent
 	uint32_t ino; /* == zero for unlink */
 	unsigned int nhash;
 	unsigned char type;
-	unsigned char name[0];
+	unsigned char name[];
 };
 
 /*
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 60207a2ae952..e4131cb1f1d4 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -61,7 +61,7 @@ struct jffs2_sum_dirent_flash
 	jint32_t ino; 		/* == zero for unlink */
 	uint8_t nsize;		/* dirent name size */
 	uint8_t type;		/* dirent type */
-	uint8_t name[0];	/* dirent name */
+	uint8_t name[];	/* dirent name */
 } __attribute__((packed));
 
 struct jffs2_sum_xattr_flash
@@ -117,7 +117,7 @@ struct jffs2_sum_dirent_mem
 	jint32_t ino; 		/* == zero for unlink */
 	uint8_t nsize;		/* dirent name size */
 	uint8_t type;		/* dirent type */
-	uint8_t name[0];	/* dirent name */
+	uint8_t name[];	/* dirent name */
 } __attribute__((packed));
 
 struct jffs2_sum_xattr_mem
-- 
cgit 


From b2b32e3aa08475d98d2c325fa5886e151fcd2981 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 28 May 2020 09:35:11 -0500
Subject: Squashfs: Replace zero-length array with flexible-array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having a
dynamically sized set of trailing elements in a structure. Kernel code should
always use “flexible array members”[1] for these cases. The older style of
one-element or zero-length arrays should no longer be used[2].

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://github.com/KSPP/linux/issues/21

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 fs/squashfs/squashfs_fs.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 7187bd1a30ea..8d64edb80ebf 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -262,7 +262,7 @@ struct squashfs_dir_index {
 	__le32			index;
 	__le32			start_block;
 	__le32			size;
-	unsigned char		name[0];
+	unsigned char		name[];
 };
 
 struct squashfs_base_inode {
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
 	__le32			inode_number;
 	__le32			nlink;
 	__le32			symlink_size;
-	char			symlink[0];
+	char			symlink[];
 };
 
 struct squashfs_reg_inode {
@@ -341,7 +341,7 @@ struct squashfs_reg_inode {
 	__le32			fragment;
 	__le32			offset;
 	__le32			file_size;
-	__le16			block_list[0];
+	__le16			block_list[];
 };
 
 struct squashfs_lreg_inode {
@@ -358,7 +358,7 @@ struct squashfs_lreg_inode {
 	__le32			fragment;
 	__le32			offset;
 	__le32			xattr;
-	__le16			block_list[0];
+	__le16			block_list[];
 };
 
 struct squashfs_dir_inode {
@@ -389,7 +389,7 @@ struct squashfs_ldir_inode {
 	__le16			i_count;
 	__le16			offset;
 	__le32			xattr;
-	struct squashfs_dir_index	index[0];
+	struct squashfs_dir_index	index[];
 };
 
 union squashfs_inode {
@@ -410,7 +410,7 @@ struct squashfs_dir_entry {
 	__le16			inode_number;
 	__le16			type;
 	__le16			size;
-	char			name[0];
+	char			name[];
 };
 
 struct squashfs_dir_header {
@@ -428,12 +428,12 @@ struct squashfs_fragment_entry {
 struct squashfs_xattr_entry {
 	__le16			type;
 	__le16			size;
-	char			data[0];
+	char			data[];
 };
 
 struct squashfs_xattr_val {
 	__le32			vsize;
-	char			value[0];
+	char			value[];
 };
 
 struct squashfs_xattr_id {
-- 
cgit 


From 6c85cacc8c096fc5cbdba61b6aa8fe675805e5d1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jun 2020 00:18:09 +0100
Subject: afs: Remove yfs_fs_fetch_file_status() as it's not used

Remove yfs_fs_fetch_file_status() as it's no longer used.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/internal.h  |  1 -
 fs/afs/yfsclient.c | 42 ------------------------------------------
 2 files changed, 43 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index bdc1e5efebd4..d2207cb40740 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1438,7 +1438,6 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 /*
  * yfsclient.c
  */
-extern void yfs_fs_fetch_file_status(struct afs_operation *);
 extern void yfs_fs_fetch_data(struct afs_operation *);
 extern void yfs_fs_create_file(struct afs_operation *);
 extern void yfs_fs_make_dir(struct afs_operation *);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 52d5af5fcd44..993591739240 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -374,48 +374,6 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
 	return 0;
 }
 
-/*
- * YFS.FetchStatus operation type
- */
-static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
-	.name		= "YFS.FetchStatus(vnode)",
-	.op		= yfs_FS_FetchStatus,
-	.deliver	= yfs_deliver_fs_status_cb_and_volsync,
-	.destructor	= afs_flat_call_destructor,
-};
-
-/*
- * Fetch the status information for a file.
- */
-void yfs_fs_fetch_file_status(struct afs_operation *op)
-{
-	struct afs_vnode_param *vp = &op->file[0];
-	struct afs_call *call;
-	__be32 *bp;
-
-	_enter(",%x,{%llx:%llu},,",
-	       key_serial(op->key), vp->fid.vid, vp->fid.vnode);
-
-	call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode,
-				   sizeof(__be32) * 2 +
-				   sizeof(struct yfs_xdr_YFSFid),
-				   sizeof(struct yfs_xdr_YFSFetchStatus) +
-				   sizeof(struct yfs_xdr_YFSCallBack) +
-				   sizeof(struct yfs_xdr_YFSVolSync));
-	if (!call)
-		return afs_op_nomem(op);
-
-	/* marshall the parameters */
-	bp = call->request;
-	bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
-	bp = xdr_encode_u32(bp, 0); /* RPC flags */
-	bp = xdr_encode_YFSFid(bp, &vp->fid);
-	yfs_check_req(call, bp);
-
-	trace_afs_make_fs_call(call, &vp->fid);
-	afs_make_op_call(op, call, GFP_NOFS);
-}
-
 /*
  * Deliver reply data to an YFS.FetchData64.
  */
-- 
cgit 


From 9bd87ec631ba07285138eed9c85645a12294f6c6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jun 2020 00:23:12 +0100
Subject: afs: Fix yfs_fs_fetch_status() to honour vnode selector

Fix yfs_fs_fetch_status() to honour the vnode selector in
op->fetch_status.which as does afs_fs_fetch_status() that allows
afs_do_lookup() to use this as an alternative to the InlineBulkStatus RPC
call if not implemented by the server.

This doesn't matter in the current code as YFS servers always implement
InlineBulkStatus, but a subsequent will call it on YFS servers too in some
circumstances.

Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/yfsclient.c | 51 ++++++++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 993591739240..8c24fdc899e3 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -329,29 +329,6 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
 	*_bp += sizeof(*x) / sizeof(__be32);
 }
 
-/*
- * Deliver a reply that's a status, callback and volsync.
- */
-static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
-{
-	struct afs_operation *op = call->op;
-	const __be32 *bp;
-	int ret;
-
-	ret = afs_transfer_reply(call);
-	if (ret < 0)
-		return ret;
-
-	/* unmarshall the reply once we've received all of it */
-	bp = call->buffer;
-	xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb);
-	xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb);
-	xdr_decode_YFSVolSync(&bp, &op->volsync);
-
-	_leave(" = 0 [done]");
-	return 0;
-}
-
 /*
  * Deliver reply data to operations that just return a file status and a volume
  * sync record.
@@ -1562,13 +1539,37 @@ void yfs_fs_release_lock(struct afs_operation *op)
 	afs_make_op_call(op, call, GFP_NOFS);
 }
 
+/*
+ * Deliver a reply to YFS.FetchStatus
+ */
+static int yfs_deliver_fs_fetch_status(struct afs_call *call)
+{
+	struct afs_operation *op = call->op;
+	struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
+	const __be32 *bp;
+	int ret;
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_YFSFetchStatus(&bp, call, &vp->scb);
+	xdr_decode_YFSCallBack(&bp, call, &vp->scb);
+	xdr_decode_YFSVolSync(&bp, &op->volsync);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
 /*
  * YFS.FetchStatus operation type
  */
 static const struct afs_call_type yfs_RXYFSFetchStatus = {
 	.name		= "YFS.FetchStatus",
 	.op		= yfs_FS_FetchStatus,
-	.deliver	= yfs_deliver_fs_status_cb_and_volsync,
+	.deliver	= yfs_deliver_fs_fetch_status,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1577,7 +1578,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = {
  */
 void yfs_fs_fetch_status(struct afs_operation *op)
 {
-	struct afs_vnode_param *vp = &op->file[0];
+	struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
 	struct afs_call *call;
 	__be32 *bp;
 
-- 
cgit 


From 44767c353127cfcbee49a89bab39a3680ecd2a45 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jun 2020 00:25:56 +0100
Subject: afs: Remove afs_operation::abort_code

Remove afs_operation::abort_code as it's read but never set.  Use
ac.abort_code instead.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c      | 2 +-
 fs/afs/internal.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 308a125e9de3..ca6b147963a9 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -648,7 +648,7 @@ static void afs_do_lookup_success(struct afs_operation *op)
 			vp = &op->file[0];
 			abort_code = vp->scb.status.abort_code;
 			if (abort_code != 0) {
-				op->abort_code = abort_code;
+				op->ac.abort_code = abort_code;
 				op->error = afs_abort_to_error(abort_code);
 			}
 			break;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d2207cb40740..598934d923cc 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -771,7 +771,6 @@ struct afs_operation {
 	struct timespec64	ctime;		/* Change time to set */
 	short			nr_files;	/* Number of entries in file[], more_files */
 	short			error;
-	unsigned int		abort_code;
 	unsigned int		debug_id;
 
 	unsigned int		cb_v_break;	/* Volume break counter before op */
-- 
cgit 


From 728279a5a1fd9fa9fa268f807391c4d19ad2822c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jun 2020 00:34:09 +0100
Subject: afs: Fix use of afs_check_for_remote_deletion()

afs_check_for_remote_deletion() checks to see if error ENOENT is returned
by the server in response to an operation and, if so, marks the primary
vnode as having been deleted as the FID is no longer valid.

However, it's being called from the operation success functions, where no
abort has happened - and if an inline abort is recorded, it's handled by
afs_vnode_commit_status().

Fix this by actually calling the operation aborted method if provided and
having that point to afs_check_for_remote_deletion().

Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c          | 21 ++++++++++++++++++---
 fs/afs/dir_silly.c    |  2 +-
 fs/afs/file.c         |  2 +-
 fs/afs/flock.c        |  4 +---
 fs/afs/fs_operation.c | 10 +++++++++-
 fs/afs/inode.c        |  1 +
 fs/afs/internal.h     | 10 +---------
 7 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ca6b147963a9..cd74731112f4 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -700,6 +700,7 @@ static const struct afs_operation_ops afs_fetch_status_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_status,
 	.issue_yfs_rpc	= yfs_fs_fetch_status,
 	.success	= afs_do_lookup_success,
+	.aborted	= afs_check_for_remote_deletion,
 };
 
 /*
@@ -1236,6 +1237,17 @@ void afs_d_release(struct dentry *dentry)
 	_enter("%pd", dentry);
 }
 
+void afs_check_for_remote_deletion(struct afs_operation *op)
+{
+	struct afs_vnode *vnode = op->file[0].vnode;
+
+	switch (op->ac.abort_code) {
+	case VNOVNODE:
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		afs_break_callback(vnode, afs_cb_break_for_deleted);
+	}
+}
+
 /*
  * Create a new inode for create/mkdir/symlink
  */
@@ -1269,7 +1281,6 @@ static void afs_create_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 	op->ctime = op->file[0].scb.status.mtime_client;
-	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
 	afs_vnode_new_inode(op);
@@ -1303,6 +1314,7 @@ static const struct afs_operation_ops afs_mkdir_operation = {
 	.issue_afs_rpc	= afs_fs_make_dir,
 	.issue_yfs_rpc	= yfs_fs_make_dir,
 	.success	= afs_create_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_create_edit_dir,
 	.put		= afs_create_put,
 };
@@ -1353,7 +1365,6 @@ static void afs_rmdir_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 	op->ctime = op->file[0].scb.status.mtime_client;
-	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
 }
@@ -1385,6 +1396,7 @@ static const struct afs_operation_ops afs_rmdir_operation = {
 	.issue_afs_rpc	= afs_fs_remove_dir,
 	.issue_yfs_rpc	= yfs_fs_remove_dir,
 	.success	= afs_rmdir_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_rmdir_edit_dir,
 	.put		= afs_rmdir_put,
 };
@@ -1484,7 +1496,6 @@ static void afs_unlink_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 	op->ctime = op->file[0].scb.status.mtime_client;
-	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[1]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1516,6 +1527,7 @@ static const struct afs_operation_ops afs_unlink_operation = {
 	.issue_afs_rpc	= afs_fs_remove_file,
 	.issue_yfs_rpc	= yfs_fs_remove_file,
 	.success	= afs_unlink_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_unlink_edit_dir,
 	.put		= afs_unlink_put,
 };
@@ -1580,6 +1592,7 @@ static const struct afs_operation_ops afs_create_operation = {
 	.issue_afs_rpc	= afs_fs_create_file,
 	.issue_yfs_rpc	= yfs_fs_create_file,
 	.success	= afs_create_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_create_edit_dir,
 	.put		= afs_create_put,
 };
@@ -1649,6 +1662,7 @@ static const struct afs_operation_ops afs_link_operation = {
 	.issue_afs_rpc	= afs_fs_link,
 	.issue_yfs_rpc	= yfs_fs_link,
 	.success	= afs_link_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_create_edit_dir,
 	.put		= afs_link_put,
 };
@@ -1700,6 +1714,7 @@ static const struct afs_operation_ops afs_symlink_operation = {
 	.issue_afs_rpc	= afs_fs_symlink,
 	.issue_yfs_rpc	= yfs_fs_symlink,
 	.success	= afs_create_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_create_edit_dir,
 	.put		= afs_create_put,
 };
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index b14e3d9a25e2..001adb87ff23 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -151,7 +151,6 @@ static void afs_silly_unlink_success(struct afs_operation *op)
 	struct afs_vnode *vnode = op->file[1].vnode;
 
 	_enter("op=%08x", op->debug_id);
-	afs_check_for_remote_deletion(op, op->file[0].vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[1]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -181,6 +180,7 @@ static const struct afs_operation_ops afs_silly_unlink_operation = {
 	.issue_afs_rpc	= afs_fs_remove_file,
 	.issue_yfs_rpc	= yfs_fs_remove_file,
 	.success	= afs_silly_unlink_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.edit_dir	= afs_silly_unlink_edit_dir,
 };
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 506c47471b42..6f6ed1605cfe 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -225,7 +225,6 @@ static void afs_fetch_data_success(struct afs_operation *op)
 	struct afs_vnode *vnode = op->file[0].vnode;
 
 	_enter("op=%08x", op->debug_id);
-	afs_check_for_remote_deletion(op, vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_stat_v(vnode, n_fetches);
 	atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
@@ -240,6 +239,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_data,
 	.issue_yfs_rpc	= yfs_fs_fetch_data,
 	.success	= afs_fetch_data_success,
+	.aborted	= afs_check_for_remote_deletion,
 	.put		= afs_fetch_data_put,
 };
 
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 71eea2a908c7..ffb8575345ca 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -175,10 +175,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
 
 static void afs_lock_success(struct afs_operation *op)
 {
-	struct afs_vnode *vnode = op->file[0].vnode;
-
 	_enter("op=%08x", op->debug_id);
-	afs_check_for_remote_deletion(op, vnode);
 	afs_vnode_commit_status(op, &op->file[0]);
 }
 
@@ -186,6 +183,7 @@ static const struct afs_operation_ops afs_set_lock_operation = {
 	.issue_afs_rpc	= afs_fs_set_lock,
 	.issue_yfs_rpc	= yfs_fs_set_lock,
 	.success	= afs_lock_success,
+	.aborted	= afs_check_for_remote_deletion,
 };
 
 /*
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 2d2dff5688a4..c264839b2fd0 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -187,9 +187,17 @@ void afs_wait_for_operation(struct afs_operation *op)
 		op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
 	}
 
-	if (op->error == 0) {
+	switch (op->error) {
+	case 0:
 		_debug("success");
 		op->ops->success(op);
+		break;
+	case -ECONNABORTED:
+		if (op->ops->aborted)
+			op->ops->aborted(op);
+		break;
+	default:
+		break;
 	}
 
 	afs_end_vnode_operation(op);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 70c925978d10..56e60d561f37 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -324,6 +324,7 @@ static const struct afs_operation_ops afs_fetch_status_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_status,
 	.issue_yfs_rpc	= yfs_fs_fetch_status,
 	.success	= afs_fetch_status_success,
+	.aborted	= afs_check_for_remote_deletion,
 };
 
 /*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 598934d923cc..9420890e3577 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -934,6 +934,7 @@ extern const struct address_space_operations afs_dir_aops;
 extern const struct dentry_operations afs_fs_dentry_operations;
 
 extern void afs_d_release(struct dentry *);
+extern void afs_check_for_remote_deletion(struct afs_operation *);
 
 /*
  * dir_edit.c
@@ -1482,15 +1483,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
 	return &vnode->vfs_inode;
 }
 
-static inline void afs_check_for_remote_deletion(struct afs_operation *op,
-						 struct afs_vnode *vnode)
-{
-	if (op->error == -ENOENT) {
-		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		afs_break_callback(vnode, afs_cb_break_for_deleted);
-	}
-}
-
 /*
  * Note that a dentry got changed.  We need to set d_fsdata to the data version
  * number derived from the result of the operation.  It doesn't matter if
-- 
cgit 


From 7c295eec1e351003a8ca06c34f9e79336fa5b244 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 16 Jun 2020 00:52:30 +0100
Subject: afs: afs_vnode_commit_status() doesn't need to check the RPC error

afs_vnode_commit_status() is only ever called if op->error is 0, so remove
the op->error checks from the function.

Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 56e60d561f37..d5d0ae7b2b1e 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -281,8 +281,6 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 
 	_enter("");
 
-	ASSERTCMP(op->error, ==, 0);
-
 	write_seqlock(&vnode->cb_lock);
 
 	if (vp->scb.have_error) {
@@ -300,7 +298,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 
 	write_sequnlock(&vnode->cb_lock);
 
-	if (op->error == 0 && vp->scb.have_status)
+	if (vp->scb.have_status)
 		afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
 }
 
-- 
cgit 


From 2d3a8e2deddea6c89961c422ec0c5b851e648c14 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Tue, 16 Jun 2020 20:16:55 +0800
Subject: block: Fix use-after-free in blkdev_get()

In blkdev_get() we call __blkdev_get() to do some internal jobs and if
there is some errors in __blkdev_get(), the bdput() is called which
means we have released the refcount of the bdev (actually the refcount of
the bdev inode). This means we cannot access bdev after that point. But
acctually bdev is still accessed in blkdev_get() after calling
__blkdev_get(). This results in use-after-free if the refcount is the
last one we released in __blkdev_get(). Let's take a look at the
following scenerio:

  CPU0            CPU1                    CPU2
blkdev_open     blkdev_open           Remove disk
                  bd_acquire
		  blkdev_get
		    __blkdev_get      del_gendisk
					bdev_unhash_inode
  bd_acquire          bdev_get_gendisk
    bd_forget           failed because of unhashed
	  bdput
	              bdput (the last one)
		        bdev_evict_inode

	  	    access bdev => use after free

[  459.350216] BUG: KASAN: use-after-free in __lock_acquire+0x24c1/0x31b0
[  459.351190] Read of size 8 at addr ffff88806c815a80 by task syz-executor.0/20132
[  459.352347]
[  459.352594] CPU: 0 PID: 20132 Comm: syz-executor.0 Not tainted 4.19.90 #2
[  459.353628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  459.354947] Call Trace:
[  459.355337]  dump_stack+0x111/0x19e
[  459.355879]  ? __lock_acquire+0x24c1/0x31b0
[  459.356523]  print_address_description+0x60/0x223
[  459.357248]  ? __lock_acquire+0x24c1/0x31b0
[  459.357887]  kasan_report.cold+0xae/0x2d8
[  459.358503]  __lock_acquire+0x24c1/0x31b0
[  459.359120]  ? _raw_spin_unlock_irq+0x24/0x40
[  459.359784]  ? lockdep_hardirqs_on+0x37b/0x580
[  459.360465]  ? _raw_spin_unlock_irq+0x24/0x40
[  459.361123]  ? finish_task_switch+0x125/0x600
[  459.361812]  ? finish_task_switch+0xee/0x600
[  459.362471]  ? mark_held_locks+0xf0/0xf0
[  459.363108]  ? __schedule+0x96f/0x21d0
[  459.363716]  lock_acquire+0x111/0x320
[  459.364285]  ? blkdev_get+0xce/0xbe0
[  459.364846]  ? blkdev_get+0xce/0xbe0
[  459.365390]  __mutex_lock+0xf9/0x12a0
[  459.365948]  ? blkdev_get+0xce/0xbe0
[  459.366493]  ? bdev_evict_inode+0x1f0/0x1f0
[  459.367130]  ? blkdev_get+0xce/0xbe0
[  459.367678]  ? destroy_inode+0xbc/0x110
[  459.368261]  ? mutex_trylock+0x1a0/0x1a0
[  459.368867]  ? __blkdev_get+0x3e6/0x1280
[  459.369463]  ? bdev_disk_changed+0x1d0/0x1d0
[  459.370114]  ? blkdev_get+0xce/0xbe0
[  459.370656]  blkdev_get+0xce/0xbe0
[  459.371178]  ? find_held_lock+0x2c/0x110
[  459.371774]  ? __blkdev_get+0x1280/0x1280
[  459.372383]  ? lock_downgrade+0x680/0x680
[  459.373002]  ? lock_acquire+0x111/0x320
[  459.373587]  ? bd_acquire+0x21/0x2c0
[  459.374134]  ? do_raw_spin_unlock+0x4f/0x250
[  459.374780]  blkdev_open+0x202/0x290
[  459.375325]  do_dentry_open+0x49e/0x1050
[  459.375924]  ? blkdev_get_by_dev+0x70/0x70
[  459.376543]  ? __x64_sys_fchdir+0x1f0/0x1f0
[  459.377192]  ? inode_permission+0xbe/0x3a0
[  459.377818]  path_openat+0x148c/0x3f50
[  459.378392]  ? kmem_cache_alloc+0xd5/0x280
[  459.379016]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.379802]  ? path_lookupat.isra.0+0x900/0x900
[  459.380489]  ? __lock_is_held+0xad/0x140
[  459.381093]  do_filp_open+0x1a1/0x280
[  459.381654]  ? may_open_dev+0xf0/0xf0
[  459.382214]  ? find_held_lock+0x2c/0x110
[  459.382816]  ? lock_downgrade+0x680/0x680
[  459.383425]  ? __lock_is_held+0xad/0x140
[  459.384024]  ? do_raw_spin_unlock+0x4f/0x250
[  459.384668]  ? _raw_spin_unlock+0x1f/0x30
[  459.385280]  ? __alloc_fd+0x448/0x560
[  459.385841]  do_sys_open+0x3c3/0x500
[  459.386386]  ? filp_open+0x70/0x70
[  459.386911]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  459.387610]  ? trace_hardirqs_off_caller+0x55/0x1c0
[  459.388342]  ? do_syscall_64+0x1a/0x520
[  459.388930]  do_syscall_64+0xc3/0x520
[  459.389490]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.390248] RIP: 0033:0x416211
[  459.390720] Code: 75 14 b8 02 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83
04 19 00 00 c3 48 83 ec 08 e8 0a fa ff ff 48 89 04 24 b8 02 00 00 00 0f
   05 <48> 8b 3c 24 48 89 c2 e8 53 fa ff ff 48 89 d0 48 83 c4 08 48 3d
      01
[  459.393483] RSP: 002b:00007fe45dfe9a60 EFLAGS: 00000293 ORIG_RAX: 0000000000000002
[  459.394610] RAX: ffffffffffffffda RBX: 00007fe45dfea6d4 RCX: 0000000000416211
[  459.395678] RDX: 00007fe45dfe9b0a RSI: 0000000000000002 RDI: 00007fe45dfe9b00
[  459.396758] RBP: 000000000076bf20 R08: 0000000000000000 R09: 000000000000000a
[  459.397930] R10: 0000000000000075 R11: 0000000000000293 R12: 00000000ffffffff
[  459.399022] R13: 0000000000000bd9 R14: 00000000004cdb80 R15: 000000000076bf2c
[  459.400168]
[  459.400430] Allocated by task 20132:
[  459.401038]  kasan_kmalloc+0xbf/0xe0
[  459.401652]  kmem_cache_alloc+0xd5/0x280
[  459.402330]  bdev_alloc_inode+0x18/0x40
[  459.402970]  alloc_inode+0x5f/0x180
[  459.403510]  iget5_locked+0x57/0xd0
[  459.404095]  bdget+0x94/0x4e0
[  459.404607]  bd_acquire+0xfa/0x2c0
[  459.405113]  blkdev_open+0x110/0x290
[  459.405702]  do_dentry_open+0x49e/0x1050
[  459.406340]  path_openat+0x148c/0x3f50
[  459.406926]  do_filp_open+0x1a1/0x280
[  459.407471]  do_sys_open+0x3c3/0x500
[  459.408010]  do_syscall_64+0xc3/0x520
[  459.408572]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.409415]
[  459.409679] Freed by task 1262:
[  459.410212]  __kasan_slab_free+0x129/0x170
[  459.410919]  kmem_cache_free+0xb2/0x2a0
[  459.411564]  rcu_process_callbacks+0xbb2/0x2320
[  459.412318]  __do_softirq+0x225/0x8ac

Fix this by delaying bdput() to the end of blkdev_get() which means we
have finished accessing bdev.

Fixes: 77ea887e433a ("implement in-kernel gendisk events handling")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e589388..08c87db3a92b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1565,10 +1565,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	 */
 	if (!for_part) {
 		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-		if (ret != 0) {
-			bdput(bdev);
+		if (ret != 0)
 			return ret;
-		}
 	}
 
  restart:
@@ -1637,8 +1635,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			BUG_ON(for_part);
 			ret = __blkdev_get(whole, mode, 1);
-			if (ret)
+			if (ret) {
+				bdput(whole);
 				goto out_clear;
+			}
 			bdev->bd_contains = whole;
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
@@ -1688,7 +1688,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_unblock_events(disk);
 	put_disk_and_module(disk);
  out:
-	bdput(bdev);
 
 	return ret;
 }
@@ -1755,6 +1754,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		bdput(whole);
 	}
 
+	if (res)
+		bdput(bdev);
+
 	return res;
 }
 EXPORT_SYMBOL(blkdev_get);
-- 
cgit 


From 9fecd13202f520f3f25d5b1c313adb740fe19773 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 1 Jun 2020 19:12:06 +0100
Subject: btrfs: fix a block group ref counter leak after failure to remove
 block group

When removing a block group, if we fail to delete the block group's item
from the extent tree, we jump to the 'out' label and end up decrementing
the block group's reference count once only (by 1), resulting in a counter
leak because the block group at that point was already removed from the
block group cache rbtree - so we have to decrement the reference count
twice, once for the rbtree and once for our lookup at the start of the
function.

There is a second bug where if removing the free space tree entries (the
call to remove_block_group_free_space()) fails we end up jumping to the
'out_put_group' label but end up decrementing the reference count only
once, when we should have done it twice, since we have already removed
the block group from the block group cache rbtree. This happens because
the reference count decrement for the rbtree reference happens after
attempting to remove the free space tree entries, which is far away from
the place where we remove the block group from the rbtree.

To make things less error prone, decrement the reference count for the
rbtree immediately after removing the block group from it. This also
eleminates the need for two different exit labels on error, renaming
'out_put_label' to just 'out' and removing the old 'out'.

Fixes: f6033c5e333238 ("btrfs: fix block group leak when removing fails")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 176e8a292fd1..6462dd0b155c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto out_put_group;
+		goto out;
 	}
 
 	/*
@@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 		if (ret) {
 			btrfs_add_delayed_iput(inode);
-			goto out_put_group;
+			goto out;
 		}
 		clear_nlink(inode);
 		/* One for the block groups ref */
@@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
 	if (ret < 0)
-		goto out_put_group;
+		goto out;
 	if (ret > 0)
 		btrfs_release_path(path);
 	if (ret == 0) {
 		ret = btrfs_del_item(trans, tree_root, path);
 		if (ret)
-			goto out_put_group;
+			goto out;
 		btrfs_release_path(path);
 	}
 
@@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		 &fs_info->block_group_cache_tree);
 	RB_CLEAR_NODE(&block_group->cache_node);
 
+	/* Once for the block groups rbtree */
+	btrfs_put_block_group(block_group);
+
 	if (fs_info->first_logical_byte == block_group->start)
 		fs_info->first_logical_byte = (u64)-1;
 	spin_unlock(&fs_info->block_group_cache_lock);
@@ -1125,10 +1128,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	ret = remove_block_group_free_space(trans, block_group);
 	if (ret)
-		goto out_put_group;
-
-	/* Once for the block groups rbtree */
-	btrfs_put_block_group(block_group);
+		goto out;
 
 	ret = remove_block_group_item(trans, path, block_group);
 	if (ret < 0)
@@ -1145,10 +1145,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		free_extent_map(em);
 	}
 
-out_put_group:
+out:
 	/* Once for the lookup reference */
 	btrfs_put_block_group(block_group);
-out:
 	if (remove_rsv)
 		btrfs_delayed_refs_rsv_release(fs_info, 1);
 	btrfs_free_path(path);
-- 
cgit 


From ffcb9d44572afbaf8fa6dbf5115bff6dab7b299e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 1 Jun 2020 19:12:19 +0100
Subject: btrfs: fix race between block group removal and block group creation

There is a race between block group removal and block group creation
when the removal is completed by a task running fitrim or scrub. When
this happens we end up failing the block group creation with an error
-EEXIST since we attempt to insert a duplicate block group item key
in the extent tree. That results in a transaction abort.

The race happens like this:

1) Task A is doing a fitrim, and at btrfs_trim_block_group() it freezes
   block group X with btrfs_freeze_block_group() (until very recently
   that was named btrfs_get_block_group_trimming());

2) Task B starts removing block group X, either because it's now unused
   or due to relocation for example. So at btrfs_remove_block_group(),
   while holding the chunk mutex and the block group's lock, it sets
   the 'removed' flag of the block group and it sets the local variable
   'remove_em' to false, because the block group is currently frozen
   (its 'frozen' counter is > 0, until very recently this counter was
   named 'trimming');

3) Task B unlocks the block group and the chunk mutex;

4) Task A is done trimming the block group and unfreezes the block group
   by calling btrfs_unfreeze_block_group() (until very recently this was
   named btrfs_put_block_group_trimming()). In this function we lock the
   block group and set the local variable 'cleanup' to true because we
   were able to decrement the block group's 'frozen' counter down to 0 and
   the flag 'removed' is set in the block group.

   Since 'cleanup' is set to true, it locks the chunk mutex and removes
   the extent mapping representing the block group from the mapping tree;

5) Task C allocates a new block group Y and it picks up the logical address
   that block group X had as the logical address for Y, because X was the
   block group with the highest logical address and now the second block
   group with the highest logical address, the last in the fs mapping tree,
   ends at an offset corresponding to block group X's logical address (this
   logical address selection is done at volumes.c:find_next_chunk()).

   At this point the new block group Y does not have yet its item added
   to the extent tree (nor the corresponding device extent items and
   chunk item in the device and chunk trees). The new group Y is added to
   the list of pending block groups in the transaction handle;

6) Before task B proceeds to removing the block group item for block
   group X from the extent tree, which has a key matching:

   (X logical offset, BTRFS_BLOCK_GROUP_ITEM_KEY, length)

   task C while ending its transaction handle calls
   btrfs_create_pending_block_groups(), which finds block group Y and
   tries to insert the block group item for Y into the exten tree, which
   fails with -EEXIST since logical offset is the same that X had and
   task B hasn't yet deleted the key from the extent tree.
   This failure results in a transaction abort, producing a stack like
   the following:

------------[ cut here ]------------
 BTRFS: Transaction aborted (error -17)
 WARNING: CPU: 2 PID: 19736 at fs/btrfs/block-group.c:2074 btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs]
 Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
 CPU: 2 PID: 19736 Comm: fsstress Tainted: G        W         5.6.0-rc7-btrfs-next-58 #5
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
 RIP: 0010:btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs]
 Code: ff ff ff 48 8b 55 50 f0 48 (...)
 RSP: 0018:ffffa4160a1c7d58 EFLAGS: 00010286
 RAX: 0000000000000000 RBX: ffff961581909d98 RCX: 0000000000000000
 RDX: 0000000000000001 RSI: ffffffffb3d63990 RDI: 0000000000000001
 RBP: ffff9614f3356a58 R08: 0000000000000000 R09: 0000000000000001
 R10: ffff9615b65b0040 R11: 0000000000000000 R12: ffff961581909c10
 R13: ffff9615b0c32000 R14: ffff9614f3356ab0 R15: ffff9614be779000
 FS:  00007f2ce2841e80(0000) GS:ffff9615bae00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000555f18780000 CR3: 0000000131d34005 CR4: 00000000003606e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  btrfs_start_dirty_block_groups+0x398/0x4e0 [btrfs]
  btrfs_commit_transaction+0xd0/0xc50 [btrfs]
  ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs]
  ? __ia32_sys_fdatasync+0x20/0x20
  iterate_supers+0xdb/0x180
  ksys_sync+0x60/0xb0
  __ia32_sys_sync+0xa/0x10
  do_syscall_64+0x5c/0x280
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 RIP: 0033:0x7f2ce1d4d5b7
 Code: 83 c4 08 48 3d 01 (...)
 RSP: 002b:00007ffd8b558c58 EFLAGS: 00000202 ORIG_RAX: 00000000000000a2
 RAX: ffffffffffffffda RBX: 000000000000002c RCX: 00007f2ce1d4d5b7
 RDX: 00000000ffffffff RSI: 00000000186ba07b RDI: 000000000000002c
 RBP: 0000555f17b9e520 R08: 0000000000000012 R09: 000000000000ce00
 R10: 0000000000000078 R11: 0000000000000202 R12: 0000000000000032
 R13: 0000000051eb851f R14: 00007ffd8b558cd0 R15: 0000555f1798ec20
 irq event stamp: 0
 hardirqs last  enabled at (0): [<0000000000000000>] 0x0
 hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
 softirqs last  enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
 softirqs last disabled at (0): [<0000000000000000>] 0x0
 ---[ end trace bd7c03622e0b0a9c ]---

Fix this simply by making btrfs_remove_block_group() remove the block
group's item from the extent tree before it flags the block group as
removed. Also make the free space deletion from the free space tree
before flagging the block group as removed, to avoid a similar race
with adding and removing free space entries for the free space tree.

Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6462dd0b155c..c037ef514b64 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1092,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	spin_unlock(&block_group->space_info->lock);
 
+	/*
+	 * Remove the free space for the block group from the free space tree
+	 * and the block group's item from the extent tree before marking the
+	 * block group as removed. This is to prevent races with tasks that
+	 * freeze and unfreeze a block group, this task and another task
+	 * allocating a new block group - the unfreeze task ends up removing
+	 * the block group's extent map before the task calling this function
+	 * deletes the block group item from the extent tree, allowing for
+	 * another task to attempt to create another block group with the same
+	 * item key (and failing with -EEXIST and a transaction abort).
+	 */
+	ret = remove_block_group_free_space(trans, block_group);
+	if (ret)
+		goto out;
+
+	ret = remove_block_group_item(trans, path, block_group);
+	if (ret < 0)
+		goto out;
+
 	mutex_lock(&fs_info->chunk_mutex);
 	spin_lock(&block_group->lock);
 	block_group->removed = 1;
@@ -1126,14 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&fs_info->chunk_mutex);
 
-	ret = remove_block_group_free_space(trans, block_group);
-	if (ret)
-		goto out;
-
-	ret = remove_block_group_item(trans, path, block_group);
-	if (ret < 0)
-		goto out;
-
 	if (remove_em) {
 		struct extent_map_tree *em_tree;
 
-- 
cgit 


From 432cd2a10f1c10cead91fe706ff5dc52f06d642a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 8 Jun 2020 13:32:55 +0100
Subject: btrfs: fix data block group relocation failure due to concurrent
 scrub

When running relocation of a data block group while scrub is running in
parallel, it is possible that the relocation will fail and abort the
current transaction with an -EINVAL error:

   [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents
   [134243.999871] ------------[ cut here ]------------
   [134244.000741] BTRFS: Transaction aborted (error -22)
   [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs]
   [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...)
   [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G        W         5.6.0-rc7-btrfs-next-58 #5
   [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
   [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs]
   [134244.017151] Code: 48 c7 c7 (...)
   [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286
   [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000
   [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001
   [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001
   [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08
   [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000
   [134244.028024] FS:  00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000
   [134244.029491] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0
   [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
   [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
   [134244.034484] Call Trace:
   [134244.034984]  btrfs_cow_block+0x12b/0x2b0 [btrfs]
   [134244.035859]  do_relocation+0x30b/0x790 [btrfs]
   [134244.036681]  ? do_raw_spin_unlock+0x49/0xc0
   [134244.037460]  ? _raw_spin_unlock+0x29/0x40
   [134244.038235]  relocate_tree_blocks+0x37b/0x730 [btrfs]
   [134244.039245]  relocate_block_group+0x388/0x770 [btrfs]
   [134244.040228]  btrfs_relocate_block_group+0x161/0x2e0 [btrfs]
   [134244.041323]  btrfs_relocate_chunk+0x36/0x110 [btrfs]
   [134244.041345]  btrfs_balance+0xc06/0x1860 [btrfs]
   [134244.043382]  ? btrfs_ioctl_balance+0x27c/0x310 [btrfs]
   [134244.045586]  btrfs_ioctl_balance+0x1ed/0x310 [btrfs]
   [134244.045611]  btrfs_ioctl+0x1880/0x3760 [btrfs]
   [134244.049043]  ? do_raw_spin_unlock+0x49/0xc0
   [134244.049838]  ? _raw_spin_unlock+0x29/0x40
   [134244.050587]  ? __handle_mm_fault+0x11b3/0x14b0
   [134244.051417]  ? ksys_ioctl+0x92/0xb0
   [134244.052070]  ksys_ioctl+0x92/0xb0
   [134244.052701]  ? trace_hardirqs_off_thunk+0x1a/0x1c
   [134244.053511]  __x64_sys_ioctl+0x16/0x20
   [134244.054206]  do_syscall_64+0x5c/0x280
   [134244.054891]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
   [134244.055819] RIP: 0033:0x7f29b51c9dd7
   [134244.056491] Code: 00 00 00 (...)
   [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
   [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7
   [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003
   [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000
   [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a
   [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0
   [134244.067626] irq event stamp: 0
   [134244.068202] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
   [134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
   [134244.070909] softirqs last  enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
   [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0
   [134244.073432] ---[ end trace bd7c03622e0b0a99 ]---

The -EINVAL error comes from the following chain of function calls:

  __btrfs_cow_block() <-- aborts the transaction
    btrfs_reloc_cow_block()
      replace_file_extents()
        get_new_location() <-- returns -EINVAL

When relocating a data block group, for each allocated extent of the block
group, we preallocate another extent (at prealloc_file_extent_cluster()),
associated with the data relocation inode, and then dirty all its pages.
These preallocated extents have, and must have, the same size that extents
from the data block group being relocated have.

Later before we start the relocation stage that updates pointers (bytenr
field of file extent items) to point to the the new extents, we trigger
writeback for the data relocation inode. The expectation is that writeback
will write the pages to the previously preallocated extents, that it
follows the NOCOW path. That is generally the case, however, if a scrub
is running it may have turned the block group that contains those extents
into RO mode, in which case writeback falls back to the COW path.

However in the COW path instead of allocating exactly one extent with the
expected size, the allocator may end up allocating several smaller extents
due to free space fragmentation - because we tell it at cow_file_range()
that the minimum allocation size can match the filesystem's sector size.
This later breaks the relocation's expectation that an extent associated
to a file extent item in the data relocation inode has the same size as
the respective extent pointed by a file extent item in another tree - in
this case the extent to which the relocation inode poins to is smaller,
causing relocation.c:get_new_location() to return -EINVAL.

For example, if we are relocating a data block group X that has a logical
address of X and the block group has an extent allocated at the logical
address X + 128KiB with a size of 64KiB:

1) At prealloc_file_extent_cluster() we allocate an extent for the data
   relocation inode with a size of 64KiB and associate it to the file
   offset 128KiB (X + 128KiB - X) of the data relocation inode. This
   preallocated extent was allocated at block group Z;

2) A scrub running in parallel turns block group Z into RO mode and
   starts scrubing its extents;

3) Relocation triggers writeback for the data relocation inode;

4) When running delalloc (btrfs_run_delalloc_range()), we try first the
   NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC
   set in its flags. However, because block group Z is in RO mode, the
   NOCOW path (run_delalloc_nocow()) falls back into the COW path, by
   calling cow_file_range();

5) At cow_file_range(), in the first iteration of the while loop we call
   btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum
   allocation size of 4KiB (fs_info->sectorsize). Due to free space
   fragmentation, btrfs_reserve_extent() ends up allocating two extents
   of 32KiB each, each one on a different iteration of that while loop;

6) Writeback of the data relocation inode completes;

7) Relocation proceeds and ends up at relocation.c:replace_file_extents(),
   with a leaf which has a file extent item that points to the data extent
   from block group X, that has a logical address (bytenr) of X + 128KiB
   and a size of 64KiB. Then it calls get_new_location(), which does a
   lookup in the data relocation tree for a file extent item starting at
   offset 128KiB (X + 128KiB - X) and belonging to the data relocation
   inode. It finds a corresponding file extent item, however that item
   points to an extent that has a size of 32KiB, which doesn't match the
   expected size of 64KiB, resuling in -EINVAL being returned from this
   function and propagated up to __btrfs_cow_block(), which aborts the
   current transaction.

To fix this make sure that at cow_file_range() when we call the allocator
we pass it a minimum allocation size corresponding the desired extent size
if the inode belongs to the data relocation tree, otherwise pass it the
filesystem's sector size as the minimum allocation size.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 12b5d61f23bb..62c3f4972ff6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode,
 	u64 num_bytes;
 	unsigned long ram_size;
 	u64 cur_alloc_size = 0;
+	u64 min_alloc_size;
 	u64 blocksize = fs_info->sectorsize;
 	struct btrfs_key ins;
 	struct extent_map *em;
@@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode,
 	btrfs_drop_extent_cache(BTRFS_I(inode), start,
 			start + num_bytes - 1, 0);
 
+	/*
+	 * Relocation relies on the relocated extents to have exactly the same
+	 * size as the original extents. Normally writeback for relocation data
+	 * extents follows a NOCOW path because relocation preallocates the
+	 * extents. However, due to an operation such as scrub turning a block
+	 * group to RO mode, it may fallback to COW mode, so we must make sure
+	 * an extent allocated during COW has exactly the requested size and can
+	 * not be split into smaller extents, otherwise relocation breaks and
+	 * fails during the stage where it updates the bytenr of file extent
+	 * items.
+	 */
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		min_alloc_size = num_bytes;
+	else
+		min_alloc_size = fs_info->sectorsize;
+
 	while (num_bytes > 0) {
 		cur_alloc_size = num_bytes;
 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
-					   fs_info->sectorsize, 0, alloc_hint,
+					   min_alloc_size, 0, alloc_hint,
 					   &ins, 1, 1);
 		if (ret < 0)
 			goto out_unlock;
-- 
cgit 


From 6bd335b469f945f75474c11e3f577f85409f39c3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 8 Jun 2020 13:33:05 +0100
Subject: btrfs: fix bytes_may_use underflow when running balance and scrub in
 parallel

When balance and scrub are running in parallel it is possible to end up
with an underflow of the bytes_may_use counter of the data space_info
object, which triggers a warning like the following:

   [134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data
   [134243.806891] ------------[ cut here ]------------
   [134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
   [134243.808819] Modules linked in: btrfs blake2b_generic xor (...)
   [134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G        W         5.6.0-rc7-btrfs-next-58 #5
   [134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
   [134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483)
   [134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs]
   [134243.819963] Code: 0b f2 85 (...)
   [134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287
   [134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000
   [134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810
   [134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000
   [134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000
   [134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810
   [134243.827432] FS:  0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000
   [134243.828451] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   [134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0
   [134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
   [134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
   [134243.831867] Call Trace:
   [134243.832211]  find_free_extent+0x4a0/0x16c0 [btrfs]
   [134243.832846]  btrfs_reserve_extent+0x91/0x180 [btrfs]
   [134243.833487]  cow_file_range+0x12d/0x490 [btrfs]
   [134243.834080]  fallback_to_cow+0x82/0x1b0 [btrfs]
   [134243.834689]  ? release_extent_buffer+0x121/0x170 [btrfs]
   [134243.835370]  run_delalloc_nocow+0x33f/0xa30 [btrfs]
   [134243.836032]  btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs]
   [134243.836725]  ? find_lock_delalloc_range+0x221/0x250 [btrfs]
   [134243.837450]  writepage_delalloc+0xe8/0x150 [btrfs]
   [134243.838059]  __extent_writepage+0xe8/0x4c0 [btrfs]
   [134243.838674]  extent_write_cache_pages+0x237/0x530 [btrfs]
   [134243.839364]  extent_writepages+0x44/0xa0 [btrfs]
   [134243.839946]  do_writepages+0x23/0x80
   [134243.840401]  __writeback_single_inode+0x59/0x700
   [134243.841006]  writeback_sb_inodes+0x267/0x5f0
   [134243.841548]  __writeback_inodes_wb+0x87/0xe0
   [134243.842091]  wb_writeback+0x382/0x590
   [134243.842574]  ? wb_workfn+0x4a2/0x6c0
   [134243.843030]  wb_workfn+0x4a2/0x6c0
   [134243.843468]  process_one_work+0x26d/0x6a0
   [134243.843978]  worker_thread+0x4f/0x3e0
   [134243.844452]  ? process_one_work+0x6a0/0x6a0
   [134243.844981]  kthread+0x103/0x140
   [134243.845400]  ? kthread_create_worker_on_cpu+0x70/0x70
   [134243.846030]  ret_from_fork+0x3a/0x50
   [134243.846494] irq event stamp: 0
   [134243.846892] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
   [134243.847682] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
   [134243.848687] softirqs last  enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020
   [134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0
   [134243.850698] ---[ end trace bd7c03622e0b0a96 ]---
   [134243.851335] ------------[ cut here ]------------

When relocating a data block group, for each extent allocated in the
block group we preallocate another extent with the same size for the
data relocation inode (we do it at prealloc_file_extent_cluster()).
We reserve space by calling btrfs_check_data_free_space(), which ends
up incrementing the data space_info's bytes_may_use counter, and
then call btrfs_prealloc_file_range() to allocate the extent, which
always decrements the bytes_may_use counter by the same amount.

The expectation is that writeback of the data relocation inode always
follows a NOCOW path, by writing into the preallocated extents. However,
when starting writeback we might end up falling back into the COW path,
because the block group that contains the preallocated extent was turned
into RO mode by a scrub running in parallel. The COW path then calls the
extent allocator which ends up calling btrfs_add_reserved_bytes(), and
this function decrements the bytes_may_use counter of the data space_info
object by an amount corresponding to the size of the allocated extent,
despite we haven't previously incremented it. When the counter currently
has a value smaller then the allocated extent we reset the counter to 0
and emit a warning, otherwise we just decrement it and slowly mess up
with this counter which is crucial for space reservation, the end result
can be granting reserved space to tasks when there isn't really enough
free space, and having the tasks fail later in critical places where
error handling consists of a transaction abort or hitting a BUG_ON().

Fix this by making sure that if we fallback to the COW path for a data
relocation inode, we increment the bytes_may_use counter of the data
space_info object. The COW path will then decrement it at
btrfs_add_reserved_bytes() on success or through its error handling part
by a call to extent_clear_unlock_delalloc() (which ends up calling
btrfs_clear_delalloc_extent() that does the decrement operation) in case
of an error.

Test case btrfs/061 from fstests could sporadically trigger this.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 62c3f4972ff6..62b49d2db928 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1378,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
 			   int *page_started, unsigned long *nr_written)
 {
 	const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+	const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+				   BTRFS_DATA_RELOC_TREE_OBJECTID);
 	const u64 range_bytes = end + 1 - start;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	u64 range_start = start;
@@ -1408,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
 	 *    data space info, which we incremented in the step above.
 	 *
 	 * If we need to fallback to cow and the inode corresponds to a free
-	 * space cache inode, we must also increment bytes_may_use of the data
-	 * space_info for the same reason. Space caches always get a prealloc
+	 * space cache inode or an inode of the data relocation tree, we must
+	 * also increment bytes_may_use of the data space_info for the same
+	 * reason. Space caches and relocated data extents always get a prealloc
 	 * extent for them, however scrub or balance may have set the block
-	 * group that contains that extent to RO mode.
+	 * group that contains that extent to RO mode and therefore force COW
+	 * when starting writeback.
 	 */
 	count = count_range_bits(io_tree, &range_start, end, range_bytes,
 				 EXTENT_NORESERVE, 0);
-	if (count > 0 || is_space_ino) {
-		const u64 bytes = is_space_ino ? range_bytes : count;
+	if (count > 0 || is_space_ino || is_reloc_ino) {
+		u64 bytes = count;
 		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
 
+		if (is_space_ino || is_reloc_ino)
+			bytes = range_bytes;
+
 		spin_lock(&sinfo->lock);
 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
 		spin_unlock(&sinfo->lock);
-- 
cgit 


From e7a79811d0db136dc2d336b56d54cf1b774ce972 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Jun 2020 10:38:44 +0100
Subject: btrfs: check if a log root exists before locking the log_mutex on
 unlink

This brings back an optimization that commit e678934cbe5f02 ("btrfs:
Remove unnecessary check from join_running_log_trans") removed, but in
a different form. So it's almost equivalent to a revert.

That commit removed an optimization where we avoid locking a root's
log_mutex when there is no log tree created in the current transaction.
The affected code path is triggered through unlink operations.

That commit was based on the assumption that the optimization was not
necessary because we used to have the following checks when the patch
was authored:

  int btrfs_del_dir_entries_in_log(...)
  {
        (...)
        if (dir->logged_trans < trans->transid)
            return 0;

        ret = join_running_log_trans(root);
        (...)
   }

   int btrfs_del_inode_ref_in_log(...)
   {
        (...)
        if (inode->logged_trans < trans->transid)
            return 0;

        ret = join_running_log_trans(root);
        (...)
   }

However before that patch was merged, another patch was merged first which
replaced those checks because they were buggy.

That other patch corresponds to commit 803f0f64d17769 ("Btrfs: fix fsync
not persisting dentry deletions due to inode evictions"). The assumption
that if the logged_trans field of an inode had a smaller value then the
current transaction's generation (transid) meant that the inode was not
logged in the current transaction was only correct if the inode was not
evicted and reloaded in the current transaction. So the corresponding bug
fix changed those checks and replaced them with the following helper
function:

  static bool inode_logged(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode)
  {
        if (inode->logged_trans == trans->transid)
                return true;

        if (inode->last_trans == trans->transid &&
            test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
            !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
                return true;

        return false;
  }

So if we have a subvolume without a log tree in the current transaction
(because we had no fsyncs), every time we unlink an inode we can end up
trying to lock the log_mutex of the root through join_running_log_trans()
twice, once for the inode being unlinked (by btrfs_del_inode_ref_in_log())
and once for the parent directory (with btrfs_del_dir_entries_in_log()).

This means if we have several unlink operations happening in parallel for
inodes in the same subvolume, and the those inodes and/or their parent
inode were changed in the current transaction, we end up having a lot of
contention on the log_mutex.

The test robots from intel reported a -30.7% performance regression for
a REAIM test after commit e678934cbe5f02 ("btrfs: Remove unnecessary check
from join_running_log_trans").

So just bring back the optimization to join_running_log_trans() where we
check first if a log root exists before trying to lock the log_mutex. This
is done by checking for a bit that is set on the root when a log tree is
created and removed when a log tree is freed (at transaction commit time).

Commit e678934cbe5f02 ("btrfs: Remove unnecessary check from
join_running_log_trans") was merged in the 5.4 merge window while commit
803f0f64d17769 ("Btrfs: fix fsync not persisting dentry deletions due to
inode evictions") was merged in the 5.3 merge window. But the first
commit was actually authored before the second commit (May 23 2019 vs
June 19 2019).

Reported-by: kernel test robot <rong.a.chen@intel.com>
Link: https://lore.kernel.org/lkml/20200611090233.GL12456@shao2-debian/
Fixes: e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h    | 2 ++
 fs/btrfs/tree-log.c | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 30ce7039bc27..d404cce8ae40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1009,6 +1009,8 @@ enum {
 	BTRFS_ROOT_DEAD_RELOC_TREE,
 	/* Mark dead root stored on device whose cleanup needs to be resumed */
 	BTRFS_ROOT_DEAD_TREE,
+	/* The root has a log tree. Used only for subvolume roots. */
+	BTRFS_ROOT_HAS_LOG_TREE,
 };
 
 /*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 920cee312f4e..cd5348f352dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		if (ret)
 			goto out;
 
+		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 		root->log_start_pid = current->pid;
 	}
@@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root)
 {
 	int ret = -ENOENT;
 
+	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+		return ret;
+
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
 		ret = 0;
@@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	if (root->log_root) {
 		free_log_tree(trans, root->log_root);
 		root->log_root = NULL;
+		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 	}
 	return 0;
 }
-- 
cgit 


From f2cb2f39ccc30fa13d3ac078d461031a63960e5b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Jun 2020 18:46:01 +0100
Subject: btrfs: fix hang on snapshot creation after RWF_NOWAIT write

If we do a successful RWF_NOWAIT write we end up locking the snapshot lock
of the inode, through a call to check_can_nocow(), but we never unlock it.

This means the next attempt to create a snapshot on the subvolume will
hang forever.

Trivial reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foobar
  $ chattr +C /mnt/foobar
  $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foobar
  $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe 0 64K" /mnt/foobar

  $ btrfs subvolume snapshot -r /mnt /mnt/snap
    --> hangs

Fix this by unlocking the snapshot lock if check_can_nocow() returned
success.

Fixes: edf064e7c6fec3 ("btrfs: nowait aio support")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2c14312b05e8..04faa04fccd1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1914,6 +1914,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 			inode_unlock(inode);
 			return -EAGAIN;
 		}
+		/* check_can_nocow() locks the snapshot lock on success */
+		btrfs_drew_write_unlock(&root->snapshot_lock);
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
-- 
cgit 


From 4b1946284dd6641afdb9457101056d9e6ee6204c Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Jun 2020 18:48:58 +0100
Subject: btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond
 eof

If we attempt to write to prealloc extent located after eof using a
RWF_NOWAIT write, we always fail with -EAGAIN.

We do actually check if we have an allocated extent for the write at
the start of btrfs_file_write_iter() through a call to check_can_nocow(),
but later when we go into the actual direct IO write path we simply
return -EAGAIN if the write starts at or beyond EOF.

Trivial to reproduce:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ chattr +C /mnt/foo

  $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foo
  wrote 65536/65536 bytes at offset 0
  64 KiB, 16 ops; 0.0004 sec (135.575 MiB/sec and 34707.1584 ops/sec)

  $ xfs_io -c "falloc -k 64K 1M" /mnt/foo

  $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe -b 64K 64K 64K" /mnt/foo
  pwrite: Resource temporarily unavailable

On xfs and ext4 the write succeeds, as expected.

Fix this by removing the wrong check at btrfs_direct_IO().

Fixes: edf064e7c6fec3 ("btrfs: nowait aio support")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 62b49d2db928..cfa863d2d97c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7889,9 +7889,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			dio_data.overwrite = 1;
 			inode_unlock(inode);
 			relock = true;
-		} else if (iocb->ki_flags & IOCB_NOWAIT) {
-			ret = -EAGAIN;
-			goto out;
 		}
 		ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
 						   offset, count);
-- 
cgit 


From 260a63395f90f67d6ab89e4266af9e3dc34a77e9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Jun 2020 18:49:13 +0100
Subject: btrfs: fix RWF_NOWAIT write not failling when we need to cow

If we attempt to do a RWF_NOWAIT write against a file range for which we
can only do NOCOW for a part of it, due to the existence of holes or
shared extents for example, we proceed with the write as if it were
possible to NOCOW the whole range.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/sdj/bar
  $ chattr +C /mnt/sdj/bar

  $ xfs_io -d -c "pwrite -S 0xab -b 256K 0 256K" /mnt/bar
  wrote 262144/262144 bytes at offset 0
  256 KiB, 1 ops; 0.0003 sec (694.444 MiB/sec and 2777.7778 ops/sec)

  $ xfs_io -c "fpunch 64K 64K" /mnt/bar
  $ sync

  $ xfs_io -d -c "pwrite -N -V 1 -b 128K -S 0xfe 0 128K" /mnt/bar
  wrote 131072/131072 bytes at offset 0
  128 KiB, 1 ops; 0.0007 sec (160.051 MiB/sec and 1280.4097 ops/sec)

This last write should fail with -EAGAIN since the file range from 64K to
128K is a hole. On xfs it fails, as expected, but on ext4 it currently
succeeds because apparently it is expensive to check if there are extents
allocated for the whole range, but I'll check with the ext4 people.

Fix the issue by checking if check_can_nocow() returns a number of
NOCOW'able bytes smaller then the requested number of bytes, and if it
does return -EAGAIN.

Fixes: edf064e7c6fec3 ("btrfs: nowait aio support")
CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 04faa04fccd1..6d5d905281c6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1904,18 +1904,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	pos = iocb->ki_pos;
 	count = iov_iter_count(from);
 	if (iocb->ki_flags & IOCB_NOWAIT) {
+		size_t nocow_bytes = count;
+
 		/*
 		 * We will allocate space in case nodatacow is not set,
 		 * so bail
 		 */
 		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 					      BTRFS_INODE_PREALLOC)) ||
-		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+		    check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) {
 			inode_unlock(inode);
 			return -EAGAIN;
 		}
 		/* check_can_nocow() locks the snapshot lock on success */
 		btrfs_drew_write_unlock(&root->snapshot_lock);
+		/*
+		 * There are holes in the range or parts of the range that must
+		 * be COWed (shared extents, RO block groups, etc), so just bail
+		 * out.
+		 */
+		if (nocow_bytes < count) {
+			inode_unlock(inode);
+			return -EAGAIN;
+		}
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
-- 
cgit 


From 5dbb75ed6900048e146247b6325742d92c892548 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Jun 2020 18:49:39 +0100
Subject: btrfs: fix RWF_NOWAIT writes blocking on extent locks and waiting for
 IO

A RWF_NOWAIT write is not supposed to wait on filesystem locks that can be
held for a long time or for ongoing IO to complete.

However when calling check_can_nocow(), if the inode has prealloc extents
or has the NOCOW flag set, we can block on extent (file range) locks
through the call to btrfs_lock_and_flush_ordered_range(). Such lock can
take a significant amount of time to be available. For example, a fiemap
task may be running, and iterating through the entire file range checking
all extents and doing backref walking to determine if they are shared,
or a readpage operation may be in progress.

Also at btrfs_lock_and_flush_ordered_range(), called by check_can_nocow(),
after locking the file range we wait for any existing ordered extent that
is in progress to complete. Another operation that can take a significant
amount of time and defeat the purpose of RWF_NOWAIT.

So fix this by trying to lock the file range and if it's currently locked
return -EAGAIN to user space. If we are able to lock the file range without
waiting and there is an ordered extent in the range, return -EAGAIN as
well, instead of waiting for it to complete. Finally, don't bother trying
to lock the snapshot lock of the root when attempting a RWF_NOWAIT write,
as that is only important for buffered writes.

Fixes: edf064e7c6fec3 ("btrfs: nowait aio support")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6d5d905281c6..2520605afc25 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 }
 
 static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
-				    size_t *write_bytes)
+				    size_t *write_bytes, bool nowait)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_root *root = inode->root;
@@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 
-	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
+	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
 		return -EAGAIN;
 
 	lockstart = round_down(pos, fs_info->sectorsize);
 	lockend = round_up(pos + *write_bytes,
 			   fs_info->sectorsize) - 1;
+	num_bytes = lockend - lockstart + 1;
 
-	btrfs_lock_and_flush_ordered_range(inode, lockstart,
-					   lockend, NULL);
+	if (nowait) {
+		struct btrfs_ordered_extent *ordered;
+
+		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+			return -EAGAIN;
+
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     num_bytes);
+		if (ordered) {
+			btrfs_put_ordered_extent(ordered);
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	} else {
+		btrfs_lock_and_flush_ordered_range(inode, lockstart,
+						   lockend, NULL);
+	}
 
-	num_bytes = lockend - lockstart + 1;
 	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
 			NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
-		btrfs_drew_write_unlock(&root->snapshot_lock);
+		if (!nowait)
+			btrfs_drew_write_unlock(&root->snapshot_lock);
 	} else {
 		*write_bytes = min_t(size_t, *write_bytes ,
 				     num_bytes - pos + lockstart);
 	}
-
+out_unlock:
 	unlock_extent(&inode->io_tree, lockstart, lockend);
 
 	return ret;
@@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 			if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 						      BTRFS_INODE_PREALLOC)) &&
 			    check_can_nocow(BTRFS_I(inode), pos,
-					&write_bytes) > 0) {
+					    &write_bytes, false) > 0) {
 				/*
 				 * For nodata cow case, no need to reserve
 				 * data space.
@@ -1912,12 +1928,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		 */
 		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 					      BTRFS_INODE_PREALLOC)) ||
-		    check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) {
+		    check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
+				    true) <= 0) {
 			inode_unlock(inode);
 			return -EAGAIN;
 		}
-		/* check_can_nocow() locks the snapshot lock on success */
-		btrfs_drew_write_unlock(&root->snapshot_lock);
 		/*
 		 * There are holes in the range or parts of the range that must
 		 * be COWed (shared extents, RO block groups, etc), so just bail
-- 
cgit 


From b091f7fede97cc64f7aaad3eeb37965aebee3082 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 16 Jun 2020 11:31:59 -0400
Subject: btrfs: use kfree() in btrfs_ioctl_get_subvol_info()

In btrfs_ioctl_get_subvol_info(), there is a classic case where kzalloc()
was incorrectly paired with kzfree(). According to David Sterba, there
isn't any sensitive information in the subvol_info that needs to be
cleared before freeing. So kzfree() isn't really needed, use kfree()
instead.

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 168deb8ef68a..e8f7c5f00894 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2692,7 +2692,7 @@ out:
 	btrfs_put_root(root);
 out_free:
 	btrfs_free_path(path);
-	kzfree(subvol_info);
+	kfree(subvol_info);
 	return ret;
 }
 
-- 
cgit 


From b6489a49f7b71964e37978d6f89bbdbdb263f6f5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 15 Jun 2020 17:36:58 +0100
Subject: afs: Fix silly rename

Fix AFS's silly rename by the following means:

 (1) Set the destination directory in afs_do_silly_rename() so as to avoid
     misbehaviour and indicate that the directory data version will
     increment by 1 so as to avoid warnings about unexpected changes in the
     DV.  Also indicate that the ctime should be updated to avoid xfstest
     grumbling.

 (2) Note when the server indicates that a directory changed more than we
     expected (AFS_OPERATION_DIR_CONFLICT), indicating a conflict with a
     third party change, checking on successful completion of unlink and
     rename.

     The problem is that the FS.RemoveFile RPC op doesn't report the status
     of the unlinked file, though YFS.RemoveFile2 does.  This can be
     mitigated by the assumption that if the directory DV cranked by
     exactly 1, we can be sure we removed one link from the file; further,
     ordinarily in AFS, files cannot be hardlinked across directories, so
     if we reduce nlink to 0, the file is deleted.

     However, if the directory DV jumps by more than 1, we cannot know if a
     third party intervened by adding or removing a link on the file we
     just removed a link from.

     The same also goes for any vnode that is at the destination of the
     FS.Rename RPC op.

 (3) Make afs_vnode_commit_status() apply the nlink drop inside the cb_lock
     section along with the other attribute updates if ->op_unlinked is set
     on the descriptor for the appropriate vnode.

 (4) Issue a follow up status fetch to the unlinked file in the event of a
     third party conflict that makes it impossible for us to know if we
     actually deleted the file or not.

 (5) Provide a flag, AFS_VNODE_SILLY_DELETED, to make afs_getattr() lie to
     the user about the nlink of a silly deleted file so that it appears as
     0, not 1.

Found with the generic/035 and generic/084 xfstests.

Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c       | 21 +++++++++++++++++++--
 fs/afs/dir_silly.c | 36 +++++++++++++++++++++++++++---------
 fs/afs/inode.c     | 22 +++++++++++++++++-----
 fs/afs/internal.h  | 17 +++++++++++++++++
 4 files changed, 80 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cd74731112f4..3e3c2bf0a722 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -696,7 +696,7 @@ static const struct afs_operation_ops afs_inline_bulk_status_operation = {
 	.success	= afs_do_lookup_success,
 };
 
-static const struct afs_operation_ops afs_fetch_status_operation = {
+static const struct afs_operation_ops afs_lookup_fetch_status_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_status,
 	.issue_yfs_rpc	= yfs_fs_fetch_status,
 	.success	= afs_do_lookup_success,
@@ -1496,6 +1496,7 @@ static void afs_unlink_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 	op->ctime = op->file[0].scb.status.mtime_client;
+	afs_check_dir_conflict(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[1]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
@@ -1580,9 +1581,24 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 
 	op->file[1].vnode = vnode;
 	op->file[1].update_ctime = true;
+	op->file[1].op_unlinked = true;
 	op->dentry	= dentry;
 	op->ops		= &afs_unlink_operation;
-	return afs_do_sync_operation(op);
+	afs_begin_vnode_operation(op);
+	afs_wait_for_operation(op);
+
+	/* If there was a conflict with a third party, check the status of the
+	 * unlinked vnode.
+	 */
+	if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+		op->file[1].update_ctime = false;
+		op->fetch_status.which = 1;
+		op->ops = &afs_fetch_status_operation;
+		afs_begin_vnode_operation(op);
+		afs_wait_for_operation(op);
+	}
+
+	return afs_put_operation(op);
 
 error:
 	return afs_put_operation(op);
@@ -1767,6 +1783,7 @@ static void afs_rename_success(struct afs_operation *op)
 	_enter("op=%08x", op->debug_id);
 
 	op->ctime = op->file[0].scb.status.mtime_client;
+	afs_check_dir_conflict(op, &op->file[1]);
 	afs_vnode_commit_status(op, &op->file[0]);
 	if (op->file[1].vnode != op->file[0].vnode) {
 		op->ctime = op->file[1].scb.status.mtime_client;
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 001adb87ff23..04f75a44f243 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -16,6 +16,7 @@ static void afs_silly_rename_success(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
 
+	afs_check_dir_conflict(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[0]);
 }
 
@@ -69,6 +70,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
 		return PTR_ERR(op);
 
 	afs_op_set_vnode(op, 0, dvnode);
+	afs_op_set_vnode(op, 1, dvnode);
+	op->file[0].dv_delta = 1;
+	op->file[1].dv_delta = 1;
+	op->file[0].update_ctime = true;
+	op->file[1].update_ctime = true;
 
 	op->dentry		= old;
 	op->dentry_2		= new;
@@ -129,6 +135,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 	switch (ret) {
 	case 0:
 		/* The rename succeeded. */
+		set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags);
 		d_move(dentry, sdentry);
 		break;
 	case -ERESTARTSYS:
@@ -148,18 +155,11 @@ out:
 
 static void afs_silly_unlink_success(struct afs_operation *op)
 {
-	struct afs_vnode *vnode = op->file[1].vnode;
-
 	_enter("op=%08x", op->debug_id);
+	afs_check_dir_conflict(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[0]);
 	afs_vnode_commit_status(op, &op->file[1]);
 	afs_update_dentry_version(op, &op->file[0], op->dentry);
-
-	drop_nlink(&vnode->vfs_inode);
-	if (vnode->vfs_inode.i_nlink == 0) {
-		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-	}
 }
 
 static void afs_silly_unlink_edit_dir(struct afs_operation *op)
@@ -200,12 +200,30 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
 
 	afs_op_set_vnode(op, 0, dvnode);
 	afs_op_set_vnode(op, 1, vnode);
+	op->file[0].dv_delta = 1;
+	op->file[0].update_ctime = true;
+	op->file[1].op_unlinked = true;
+	op->file[1].update_ctime = true;
 
 	op->dentry	= dentry;
 	op->ops		= &afs_silly_unlink_operation;
 
 	trace_afs_silly_rename(vnode, true);
-	return afs_do_sync_operation(op);
+	afs_begin_vnode_operation(op);
+	afs_wait_for_operation(op);
+
+	/* If there was a conflict with a third party, check the status of the
+	 * unlinked vnode.
+	 */
+	if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+		op->file[1].update_ctime = false;
+		op->fetch_status.which = 1;
+		op->ops = &afs_fetch_status_operation;
+		afs_begin_vnode_operation(op);
+		afs_wait_for_operation(op);
+	}
+
+	return afs_put_operation(op);
 }
 
 /*
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d5d0ae7b2b1e..1d13d2e882ad 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -284,16 +284,25 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 	write_seqlock(&vnode->cb_lock);
 
 	if (vp->scb.have_error) {
+		/* A YFS server will return this from RemoveFile2 and AFS and
+		 * YFS will return this from InlineBulkStatus.
+		 */
 		if (vp->scb.status.abort_code == VNOVNODE) {
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			clear_nlink(&vnode->vfs_inode);
 			__afs_break_callback(vnode, afs_cb_break_for_deleted);
+			op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
 		}
-	} else {
-		if (vp->scb.have_status)
-			afs_apply_status(op, vp);
+	} else if (vp->scb.have_status) {
+		afs_apply_status(op, vp);
 		if (vp->scb.have_cb)
 			afs_apply_callback(op, vp);
+	} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+		drop_nlink(&vnode->vfs_inode);
+		if (vnode->vfs_inode.i_nlink == 0) {
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			__afs_break_callback(vnode, afs_cb_break_for_deleted);
+		}
 	}
 
 	write_sequnlock(&vnode->cb_lock);
@@ -304,7 +313,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
 
 static void afs_fetch_status_success(struct afs_operation *op)
 {
-	struct afs_vnode_param *vp = &op->file[0];
+	struct afs_vnode_param *vp = &op->file[op->fetch_status.which];
 	struct afs_vnode *vnode = vp->vnode;
 	int ret;
 
@@ -318,7 +327,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
 	}
 }
 
-static const struct afs_operation_ops afs_fetch_status_operation = {
+const struct afs_operation_ops afs_fetch_status_operation = {
 	.issue_afs_rpc	= afs_fs_fetch_status,
 	.issue_yfs_rpc	= yfs_fs_fetch_status,
 	.success	= afs_fetch_status_success,
@@ -729,6 +738,9 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 	do {
 		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
 		generic_fillattr(inode, stat);
+		if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
+		    stat->nlink > 0)
+			stat->nlink -= 1;
 	} while (need_seqretry(&vnode->cb_lock, seq));
 
 	done_seqretry(&vnode->cb_lock, seq);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9420890e3577..573a5922c3bb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -634,6 +634,7 @@ struct afs_vnode {
 #define AFS_VNODE_AUTOCELL	6		/* set if Vnode is an auto mount point */
 #define AFS_VNODE_PSEUDODIR	7 		/* set if Vnode is a pseudo directory */
 #define AFS_VNODE_NEW_CONTENT	8		/* Set if file has new content (create/trunc-0) */
+#define AFS_VNODE_SILLY_DELETED	9		/* Set if file has been silly-deleted */
 
 	struct list_head	wb_keys;	/* List of keys available for writeback */
 	struct list_head	pending_locks;	/* locks waiting to be granted */
@@ -748,6 +749,7 @@ struct afs_vnode_param {
 	bool			need_io_lock:1;	/* T if we need the I/O lock on this */
 	bool			update_ctime:1;	/* Need to update the ctime */
 	bool			set_size:1;	/* Must update i_size */
+	bool			op_unlinked:1;	/* True if file was unlinked by op */
 };
 
 /*
@@ -839,6 +841,7 @@ struct afs_operation {
 #define AFS_OPERATION_LOCK_1		0x0200	/* Set if have io_lock on file[1] */
 #define AFS_OPERATION_TRIED_ALL		0x0400	/* Set if we've tried all the fileservers */
 #define AFS_OPERATION_RETRY_SERVER	0x0800	/* Set if we should retry the current server */
+#define AFS_OPERATION_DIR_CONFLICT	0x1000	/* Set if we detected a 3rd-party dir change */
 };
 
 /*
@@ -1066,6 +1069,8 @@ extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
 /*
  * inode.c
  */
+extern const struct afs_operation_ops afs_fetch_status_operation;
+
 extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
 extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
 extern int afs_ilookup5_test_by_fid(struct inode *, void *);
@@ -1497,6 +1502,18 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
 			(void *)(unsigned long)dir_vp->scb.status.data_version;
 }
 
+/*
+ * Check for a conflicting operation on a directory that we just unlinked from.
+ * If someone managed to sneak a link or an unlink in on the file we just
+ * unlinked, we won't be able to trust nlink on an AFS file (but not YFS).
+ */
+static inline void afs_check_dir_conflict(struct afs_operation *op,
+					  struct afs_vnode_param *dvp)
+{
+	if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version)
+		op->flags |= AFS_OPERATION_DIR_CONFLICT;
+}
+
 static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
 {
 	trace_afs_io_error(call->debug_id, -EIO, where);
-- 
cgit 


From 4e264ffd953463cd14c0720eaa9315ac052f5973 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 16 Jun 2020 19:14:08 +0900
Subject: proc/bootconfig: Fix to use correct quotes for value

Fix /proc/bootconfig to select double or single quotes
corrctly according to the value.

If a bootconfig value includes a double quote character,
we must use single-quotes to quote that value.

This modifies if() condition and blocks for avoiding
double-quote in value check in 2 places. Anyway, since
xbc_array_for_each_value() can handle the array which
has a single node correctly.
Thus,

if (vnode && xbc_node_is_array(vnode)) {
	xbc_array_for_each_value(vnode)	/* vnode->next != NULL */
		...
} else {
	snprintf(val); /* val is an empty string if !vnode */
}

is equivalent to

if (vnode) {
	xbc_array_for_each_value(vnode)	/* vnode->next can be NULL */
		...
} else {
	snprintf("");	/* value is always empty */
}

Link: http://lkml.kernel.org/r/159230244786.65555.3763894451251622488.stgit@devnote2

Cc: stable@vger.kernel.org
Fixes: c1a3c36017d4 ("proc: bootconfig: Add /proc/bootconfig to show boot config list")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 fs/proc/bootconfig.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 9955d75c0585..ad31ec4ad627 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v)
 static int __init copy_xbc_key_value_list(char *dst, size_t size)
 {
 	struct xbc_node *leaf, *vnode;
-	const char *val;
 	char *key, *end = dst + size;
+	const char *val;
+	char q;
 	int ret = 0;
 
 	key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
@@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
 			break;
 		dst += ret;
 		vnode = xbc_node_get_child(leaf);
-		if (vnode && xbc_node_is_array(vnode)) {
+		if (vnode) {
 			xbc_array_for_each_value(vnode, val) {
-				ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
-					val, vnode->next ? ", " : "\n");
+				if (strchr(val, '"'))
+					q = '\'';
+				else
+					q = '"';
+				ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+					q, val, q, vnode->next ? ", " : "\n");
 				if (ret < 0)
 					goto out;
 				dst += ret;
 			}
 		} else {
-			ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+			ret = snprintf(dst, rest(dst, end), "\"\"\n");
 			if (ret < 0)
 				break;
 			dst += ret;
-- 
cgit 


From fe557319aa06c23cffc9346000f119547e0f289a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 17 Jun 2020 09:37:53 +0200
Subject: maccess: rename probe_kernel_{read,write} to
 copy_{from,to}_kernel_nofault

Better describe what these functions do.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8ba492d44e68..e502414b3556 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				 * Using bounce buffer to bypass the
 				 * hardened user copy kernel text checks.
 				 */
-				if (probe_kernel_read(buf, (void *) start, tsz)) {
+				if (copy_from_kernel_nofault(buf, (void *)start,
+						tsz)) {
 					if (clear_user(buffer, tsz)) {
 						ret = -EFAULT;
 						goto out;
-- 
cgit 


From 2d7d67920e5c8e0854df23ca77da2dd5880ce5dd Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Tue, 16 Jun 2020 02:06:37 +0800
Subject: io_uring: don't fail links for EAGAIN error in IOPOLL mode

In IOPOLL mode, for EAGAIN error, we'll try to submit io request
again using io-wq, so don't fail rest of links if this io request
has links.

Cc: stable@vger.kernel.org
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e17df662c191..eb3797714539 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1988,7 +1988,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 	if (kiocb->ki_flags & IOCB_WRITE)
 		kiocb_end_write(req);
 
-	if (res != req->result)
+	if (res != -EAGAIN && res != req->result)
 		req_set_fail_links(req);
 	req->result = res;
 	if (res != -EAGAIN)
-- 
cgit 


From bbde017a32b32d2fa8d5fddca25fade20132abf8 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Tue, 16 Jun 2020 02:06:38 +0800
Subject: io_uring: add memory barrier to synchronize io_kiocb's result and
 iopoll_completed

In io_complete_rw_iopoll(), stores to io_kiocb's result and iopoll
completed are two independent store operations, to ensure that once
iopoll_completed is ture and then req->result must been perceived by
the cpu executing io_do_iopoll(), proper memory barrier should be used.

And in io_do_iopoll(), we check whether req->result is EAGAIN, if it is,
we'll need to issue this io request using io-wq again. In order to just
issue a single smp_rmb() on the completion side, move the re-submit work
to io_iopoll_complete().

Cc: stable@vger.kernel.org
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
[axboe: don't set ->iopoll_completed for -EAGAIN retry]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index eb3797714539..9d2ae9aa8b45 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1742,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req)
 	return cflags;
 }
 
+static void io_iopoll_queue(struct list_head *again)
+{
+	struct io_kiocb *req;
+
+	do {
+		req = list_first_entry(again, struct io_kiocb, list);
+		list_del(&req->list);
+		refcount_inc(&req->refs);
+		io_queue_async_work(req);
+	} while (!list_empty(again));
+}
+
 /*
  * Find and free completed poll iocbs
  */
@@ -1750,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 {
 	struct req_batch rb;
 	struct io_kiocb *req;
+	LIST_HEAD(again);
+
+	/* order with ->result store in io_complete_rw_iopoll() */
+	smp_rmb();
 
 	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 
 		req = list_first_entry(done, struct io_kiocb, list);
+		if (READ_ONCE(req->result) == -EAGAIN) {
+			req->iopoll_completed = 0;
+			list_move_tail(&req->list, &again);
+			continue;
+		}
 		list_del(&req->list);
 
 		if (req->flags & REQ_F_BUFFER_SELECTED)
@@ -1773,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	if (ctx->flags & IORING_SETUP_SQPOLL)
 		io_cqring_ev_posted(ctx);
 	io_free_req_many(ctx, &rb);
-}
-
-static void io_iopoll_queue(struct list_head *again)
-{
-	struct io_kiocb *req;
 
-	do {
-		req = list_first_entry(again, struct io_kiocb, list);
-		list_del(&req->list);
-		refcount_inc(&req->refs);
-		io_queue_async_work(req);
-	} while (!list_empty(again));
+	if (!list_empty(&again))
+		io_iopoll_queue(&again);
 }
 
 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1792,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 {
 	struct io_kiocb *req, *tmp;
 	LIST_HEAD(done);
-	LIST_HEAD(again);
 	bool spin;
 	int ret;
 
@@ -1818,13 +1829,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (!list_empty(&done))
 			break;
 
-		if (req->result == -EAGAIN) {
-			list_move_tail(&req->list, &again);
-			continue;
-		}
-		if (!list_empty(&again))
-			break;
-
 		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 		if (ret < 0)
 			break;
@@ -1837,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	if (!list_empty(&done))
 		io_iopoll_complete(ctx, nr_events, &done);
 
-	if (!list_empty(&again))
-		io_iopoll_queue(&again);
-
 	return ret;
 }
 
@@ -1990,9 +1991,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 
 	if (res != -EAGAIN && res != req->result)
 		req_set_fail_links(req);
-	req->result = res;
-	if (res != -EAGAIN)
+
+	WRITE_ONCE(req->result, res);
+	/* order with io_poll_complete() checking ->result */
+	if (res != -EAGAIN) {
+		smp_wmb();
 		WRITE_ONCE(req->iopoll_completed, 1);
+	}
 }
 
 /*
-- 
cgit 


From 9d8426a09195e2dcf2aa249de2aaadd792d491c7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Jun 2020 18:42:49 -0600
Subject: io_uring: acquire 'mm' for task_work for SQPOLL

If we're unlucky with timing, we could be running task_work after
having dropped the memory context in the sq thread. Since dropping
the context requires a runnable task state, we cannot reliably drop
it as part of our check-for-work loop in io_sq_thread(). Instead,
abstract out the mm acquire for the sq thread into a helper, and call
it from the async task work handler.

Cc: stable@vger.kernel.org # v5.7
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9d2ae9aa8b45..98c83fbf4f88 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4256,6 +4256,28 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&pt->req->apoll->poll, pt, head);
 }
 
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		kthread_unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		kthread_use_mm(ctx->sqo_mm);
+	}
+
+	return 0;
+}
+
 static void io_async_task_func(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
@@ -4290,11 +4312,16 @@ static void io_async_task_func(struct callback_head *cb)
 
 	if (!canceled) {
 		__set_current_state(TASK_RUNNING);
+		if (io_sq_thread_acquire_mm(ctx, req)) {
+			io_cqring_add_event(req, -EFAULT);
+			goto end_req;
+		}
 		mutex_lock(&ctx->uring_lock);
 		__io_queue_sqe(req, NULL);
 		mutex_unlock(&ctx->uring_lock);
 	} else {
 		io_cqring_ev_posted(ctx);
+end_req:
 		req_set_fail_links(req);
 		io_double_put_req(req);
 	}
@@ -5841,11 +5868,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
 
-	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
-		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
-			return -EFAULT;
-		kthread_use_mm(ctx->sqo_mm);
-	}
+	if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+		return -EFAULT;
 
 	sqe_flags = READ_ONCE(sqe->flags);
 	/* enforce forwards compatibility on users */
@@ -5954,16 +5978,6 @@ fail_req:
 	return submitted;
 }
 
-static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
-	struct mm_struct *mm = current->mm;
-
-	if (mm) {
-		kthread_unuse_mm(mm);
-		mmput(mm);
-	}
-}
-
 static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
-- 
cgit 


From 56952e91acc93ed624fe9da840900defb75f1323 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 17 Jun 2020 15:00:04 -0600
Subject: io_uring: reap poll completions while waiting for refs to drop on
 exit

If we're doing polled IO and end up having requests being submitted
async, then completions can come in while we're waiting for refs to
drop. We need to reap these manually, as nobody else will be looking
for them.

Break the wait into 1/20th of a second time waits, and check for done
poll completions if we time out. Otherwise we can have done poll
completions sitting in ctx->poll_list, which needs us to reap them but
we're just waiting for them.

Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 98c83fbf4f88..2038d52c5450 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7363,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work)
 	if (ctx->rings)
 		io_cqring_overflow_flush(ctx, true);
 
-	wait_for_completion(&ctx->ref_comp);
+	/*
+	 * If we're doing polled IO and end up having requests being
+	 * submitted async (out-of-line), then completions can come in while
+	 * we're waiting for refs to drop. We need to reap these manually,
+	 * as nobody else will be looking for them.
+	 */
+	while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+		io_iopoll_reap_events(ctx);
+		if (ctx->rings)
+			io_cqring_overflow_flush(ctx, true);
+	}
 	io_ring_ctx_free(ctx);
 }
 
-- 
cgit 


From 6f2cc1664db20676069cff27a461ccc97dbfd114 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Thu, 18 Jun 2020 15:01:56 +0800
Subject: io_uring: fix possible race condition against REQ_F_NEED_CLEANUP

In io_read() or io_write(), when io request is submitted successfully,
it'll go through the below sequence:

    kfree(iovec);
    req->flags &= ~REQ_F_NEED_CLEANUP;
    return ret;

But clearing REQ_F_NEED_CLEANUP might be unsafe. The io request may
already have been completed, and then io_complete_rw_iopoll()
and io_complete_rw() will be called, both of which will also modify
req->flags if needed. This causes a race condition, with concurrent
non-atomic modification of req->flags.

To eliminate this race, in io_read() or io_write(), if io request is
submitted successfully, we don't remove REQ_F_NEED_CLEANUP flag. If
REQ_F_NEED_CLEANUP is set, we'll leave __io_req_aux_free() to the
iovec cleanup work correspondingly.

Cc: stable@vger.kernel.org
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2038d52c5450..a78201b96179 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2670,8 +2670,8 @@ copy_iov:
 		}
 	}
 out_free:
-	kfree(iovec);
-	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (!(req->flags & REQ_F_NEED_CLEANUP))
+		kfree(iovec);
 	return ret;
 }
 
@@ -2793,8 +2793,8 @@ copy_iov:
 		}
 	}
 out_free:
-	req->flags &= ~REQ_F_NEED_CLEANUP;
-	kfree(iovec);
+	if (!(req->flags & REQ_F_NEED_CLEANUP))
+		kfree(iovec);
 	return ret;
 }
 
-- 
cgit 


From 3373a3461aa15b7f9a871fa4cb2c9ef21ac20b47 Mon Sep 17 00:00:00 2001
From: Zheng Bin <zhengbin13@huawei.com>
Date: Thu, 18 Jun 2020 12:21:38 +0800
Subject: block: make function 'kill_bdev' static

kill_bdev does not have any external user, so make it static.

Signed-off-by: Zheng Bin <zhengbin13@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 08c87db3a92b..0ae656e022fd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev)
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
@@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev)
 
 	invalidate_bh_lrus();
 	truncate_inode_pages(mapping, 0);
-}	
-EXPORT_SYMBOL(kill_bdev);
+}
 
 /* Invalidate clean unused buffers and pagecache. */
 void invalidate_bdev(struct block_device *bdev)
-- 
cgit 


From f8ea5c7bceeb6ce6e7b3e7fb28c9dda8c0a58dcb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 19 Jun 2020 00:01:28 +0100
Subject: afs: Fix afs_do_lookup() to call correct fetch-status op variant

Fix afs_do_lookup()'s fallback case for when FS.InlineBulkStatus isn't
supported by the server.

In the fallback, it calls FS.FetchStatus for the specific vnode it's
meant to be looking up.  Commit b6489a49f7b7 broke this by renaming one
of the two identically-named afs_fetch_status_operation descriptors to
something else so that one of them could be made non-static.  The site
that used the renamed one, however, wasn't renamed and didn't produce
any warning because the other was declared in a header.

Fix this by making afs_do_lookup() use the renamed variant.

Note that there are two variants of the success method because one is
called from ->lookup() where we may or may not have an inode, but can't
call iget until after we've talked to the server - whereas the other is
called from within iget where we have an inode, but it may or may not be
initialised.

The latter variant expects there to be an inode, but because it's being
called from there former case, there might not be - resulting in an oops
like the following:

  BUG: kernel NULL pointer dereference, address: 00000000000000b0
  ...
  RIP: 0010:afs_fetch_status_success+0x27/0x7e
  ...
  Call Trace:
    afs_wait_for_operation+0xda/0x234
    afs_do_lookup+0x2fe/0x3c1
    afs_lookup+0x3c5/0x4bd
    __lookup_slow+0xcd/0x10f
    walk_component+0xa2/0x10c
    path_lookupat.isra.0+0x80/0x110
    filename_lookup+0x81/0x104
    vfs_statx+0x76/0x109
    __do_sys_newlstat+0x39/0x6b
    do_syscall_64+0x4c/0x78
    entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: b6489a49f7b7 ("afs: Fix silly rename")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 3e3c2bf0a722..96757f3abd74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
 		 * to FS.FetchStatus for op->file[1].
 		 */
 		op->fetch_status.which = 1;
-		op->ops = &afs_fetch_status_operation;
+		op->ops = &afs_lookup_fetch_status_operation;
 		afs_begin_vnode_operation(op);
 		afs_wait_for_operation(op);
 	}
-- 
cgit 


From 5481fc6eb8a7f4b76d8ad1be371d2e11b22bfb55 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 19 Jun 2020 23:39:36 +0100
Subject: afs: Fix hang on rmmod due to outstanding timer

The fileserver probe timer, net->fs_probe_timer, isn't cancelled when
the kafs module is being removed and so the count it holds on
net->servers_outstanding doesn't get dropped..

This causes rmmod to wait forever.  The hung process shows a stack like:

	afs_purge_servers+0x1b5/0x23c [kafs]
	afs_net_exit+0x44/0x6e [kafs]
	ops_exit_list+0x72/0x93
	unregister_pernet_operations+0x14c/0x1ba
	unregister_pernet_subsys+0x1d/0x2a
	afs_exit+0x29/0x6f [kafs]
	__do_sys_delete_module.isra.0+0x1a2/0x24b
	do_syscall_64+0x51/0x95
	entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fix this by:

 (1) Attempting to cancel the probe timer and, if successful, drop the
     count that the timer was holding.

 (2) Make the timer function just drop the count and not schedule the
     prober if the afs portion of net namespace is being destroyed.

Also, whilst we're at it, make the following changes:

 (3) Initialise net->servers_outstanding to 1 and decrement it before
     waiting on it so that it doesn't generate wake up events by being
     decremented to 0 until we're cleaning up.

 (4) Switch the atomic_dec() on ->servers_outstanding for ->fs_timer in
     afs_purge_servers() to use the helper function for that.

Fixes: f6cbb368bcb0 ("afs: Actively poll fileservers to maintain NAT or firewall openings")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/fs_probe.c | 11 ++++++++++-
 fs/afs/internal.h |  1 +
 fs/afs/main.c     |  3 +++
 fs/afs/server.c   |  3 ++-
 4 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b34f74b0f319..5d9ef517cf81 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
 {
 	struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
 
-	if (!queue_work(afs_wq, &net->fs_prober))
+	if (!net->live || !queue_work(afs_wq, &net->fs_prober))
 		afs_dec_servers_outstanding(net);
 }
 
@@ -458,3 +458,12 @@ dont_wait:
 		return -ETIME;
 	return -EDESTADDRREQ;
 }
+
+/*
+ * Clean up the probing when the namespace is killed off.
+ */
+void afs_fs_probe_cleanup(struct afs_net *net)
+{
+	if (del_timer_sync(&net->fs_probe_timer))
+		afs_dec_servers_outstanding(net);
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 573a5922c3bb..d520535ddb62 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1065,6 +1065,7 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
 extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
 extern void afs_fs_probe_dispatcher(struct work_struct *);
 extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+extern void afs_fs_probe_cleanup(struct afs_net *);
 
 /*
  * inode.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9c79c91e8005..31b472f7c734 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns)
 	timer_setup(&net->fs_timer, afs_servers_timer, 0);
 	INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
 	timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
+	atomic_set(&net->servers_outstanding, 1);
 
 	ret = -ENOMEM;
 	sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
@@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns)
 
 error_open_socket:
 	net->live = false;
+	afs_fs_probe_cleanup(net);
 	afs_cell_purge(net);
 	afs_purge_servers(net);
 error_cell_init:
@@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
 	struct afs_net *net = afs_net(net_ns);
 
 	net->live = false;
+	afs_fs_probe_cleanup(net);
 	afs_cell_purge(net);
 	afs_purge_servers(net);
 	afs_close_socket(net);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 039e3488511c..e82e452e2612 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net)
 	_enter("");
 
 	if (del_timer_sync(&net->fs_timer))
-		atomic_dec(&net->servers_outstanding);
+		afs_dec_servers_outstanding(net);
 
 	afs_queue_server_manager(net);
 
 	_debug("wait");
+	atomic_dec(&net->servers_outstanding);
 	wait_var_event(&net->servers_outstanding,
 		       !atomic_read(&net->servers_outstanding));
 	_leave("");
-- 
cgit 


From 3c597282887fd55181578996dca52ce697d985a5 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@redhat.com>
Date: Fri, 19 Jun 2020 07:43:49 +0800
Subject: erofs: fix partially uninitialized misuse in z_erofs_onlinepage_fixup

Hongyu reported "id != index" in z_erofs_onlinepage_fixup() with
specific aarch64 environment easily, which wasn't shown before.

After digging into that, I found that high 32 bits of page->private
was set to 0xaaaaaaaa rather than 0 (due to z_erofs_onlinepage_init
behavior with specific compiler options). Actually we only use low
32 bits to keep the page information since page->private is only 4
bytes on most 32-bit platforms. However z_erofs_onlinepage_fixup()
uses the upper 32 bits by mistake.

Let's fix it now.

Reported-and-tested-by: Hongyu Jin <hongyu.jin@unisoc.com>
Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support")
Cc: <stable@vger.kernel.org> # 4.19+
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Link: https://lore.kernel.org/r/20200618234349.22553-1-hsiangkao@aol.com
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
 fs/erofs/zdata.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 7824f5563a55..9b66c28b3ae9 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page)
 static inline void z_erofs_onlinepage_fixup(struct page *page,
 	uintptr_t index, bool down)
 {
-	unsigned long *p, o, v, id;
-repeat:
-	p = &page_private(page);
-	o = READ_ONCE(*p);
+	union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+	int orig, orig_index, val;
 
-	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
-	if (id) {
+repeat:
+	orig = atomic_read(u.o);
+	orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (orig_index) {
 		if (!index)
 			return;
 
-		DBG_BUGON(id != index);
+		DBG_BUGON(orig_index != index);
 	}
 
-	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
-		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
-	if (cmpxchg(p, o, v) != o)
+	val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+	if (atomic_cmpxchg(u.o, orig, val) != orig)
 		goto repeat;
 }
 
-- 
cgit